2个版本
版本 | 发布日期 |
---|---|
0.1.1 | 2021年2月22日 |
0.1.0 | 2021年2月12日 |
#10 in #twilio
760KB
285 代码行(不包括注释)
Lookout
一个异步网页抓取工具。给定一个描述网站、选择器、正则表达式和预期匹配次数的TOML文件,它会持续抓取这些网站,并在预期的匹配未满足时通过Twilio给你打电话或发送短信(如果已配置)。
例如,这可以抓取产品库存,并在页面上不再找到预期的“缺货”元素时给你打电话。
安装
cargo install lookout
使用方法
默认情况下,lookout将从运行它的目录加载 `lookout.toml`。
lookout
你可以选择传递一个参数,指定配置文件的路径。
lookout ~/my-lookout.toml
如果你在服务器上部署并且想要打印日志,请设置环境变量 `RUST_LOG=lookout`。
RUST_LOG=lookout lookout
要调试抓取器,请在配置中打开 `log`,并使用 `RUST_LOG=lookout` 运行。
免责声明:在抓取时始终遵守限制/法规。下面的例子仅用于说明。
示例
# Global settings (root table). These keys must stay above the first
# [table] header so they are not absorbed into a table.
# Turn `log` on and run with RUST_LOG=lookout to debug the scrapers.
log = false
# in seconds
error_delay = 300 # timeout 5 minutes after error
alert_delay = 300 # timeout 5 minutes after an alert
# upper bound on errors to prevent unbounded retries
allowed_errors = 20
# twilio configuration
# Twilio is what places the phone call / sends the text message when a
# scraper's expected matches are not met.
[twilio]
# calls and texts are toggled independently
enable_call = true
enable_text = true
# placeholder credentials: replace with your own.
# NOTE(review): presumably the Twilio account SID and auth token; confirm.
id = "XXXXXXXXXXXXXXXX"
auth = "XXXXXXXXXXXXXXXX"
# phone number that receives the alert
to_phone = "+10123456789"
# NOTE(review): presumably the Twilio-owned number the alert is sent from; confirm.
from_phone = "+10123456789"
# NOTE(review): the role of this URL is not described here; presumably the
# callback Twilio requests when placing a call. Confirm before relying on it.
default_callback = "https://<callback-url>"
# define a [[lookout]] block for each scraper
# required:
# - name
# - url
# - regex to search for
# - number of expected matches
# - timeout after each request
# optional:
# - list of selectors which will be used to filter the html
# - headers (see below)
[[lookout]]
name = "newegg"
url = "https://www.newegg.com/product"
# An alert is raised once the expected matches for this regex are no longer met.
regex = "OUT OF STOCK|SOLD OUT|Sold Out"
expected_matches = 3
# Optional: selectors used to filter the html, one per line.
selectors = [
    "#app > div.page-content > div.page-section > div > div > div.row-side > div.product-buy-box",
    "#app > div.page-content > div.page-section > div > div > div.row-body > div.product-main.display-flex > div",
]
# timeout after each request
# NOTE(review): presumably in seconds, like the delays in the root table; confirm.
timeout = 180
# Minimal scraper: only the required keys are set; the optional
# selectors and headers are omitted.
[[lookout]]
name = "b&h"
url = "https://www.bhphotovideo.com/some/cool/product"
regex = "Notify When Available"
expected_matches = 1
timeout = 180
# Scraper with custom request headers.
[[lookout]]
name = "bestbuy"
url = "https://www.bestbuy.com/site/something/"
regex = "Sold Out"
expected_matches = 2
timeout = 180
# optionally, add custom request headers for the scraper above
# [lookout.headers] is a sub-table of the most recent [[lookout]] entry
# (bestbuy here), so it has to come before any further [[lookout]] block.
[lookout.headers]
"User-Agent" = "Wget/1.20.1 (linux-gnu)"
"Accept-Encoding" = "identity"
"ACCEPT" = "*/*"
"Connection" = "Keep-Alive"
依赖项
~21–37MB
~598K SLoC