2个版本

0.1.1	2021年2月22日
0.1.0	2021年2月12日

#10 in #twilio

MIT 许可证

760KB
285 代码行（不包括注释）

Lookout

一个异步抓取工具。给定一个描述网站、选择器、正则表达式和预期匹配次数的TOML文件，它会持续抓取这些网站；当预期的匹配未得到满足时，它会通过Twilio给你打电话或发送短信（如果已配置）。

例如，这可以抓取产品库存，并在页面上不再找到预期的“缺货”元素时给你打电话。

lookout screenshot

安装

cargo install lookout

使用方法

默认情况下，lookout将从运行它的目录加载lookout.toml。

lookout

你可以选择传递一个参数，指定配置文件的路径。

lookout ~/my-lookout.toml

如果你在服务器上部署并且想要打印日志，请设置环境变量RUST_LOG=lookout。

RUST_LOG=lookout lookout

要调试抓取器，请在配置中将log设为true，并使用RUST_LOG=lookout运行。

免责声明：在抓取时始终遵守限制/法规。下面的例子仅用于说明。

示例

# Turn on to debug the scrapers (run with RUST_LOG=lookout to see the output).
log = false

# Delays below are expressed in seconds.
# Pause for 5 minutes after an error.
error_delay = 300
# Pause for 5 minutes after an alert.
alert_delay = 300

# Cap on the number of errors, so retries cannot continue without bound.
allowed_errors = 20

# Twilio settings used for phone-call and text-message alerts.
[twilio]
# Credentials (placeholders; presumably the Twilio account SID and auth
# token - TODO confirm against the lookout documentation).
id = "XXXXXXXXXXXXXXXX"
auth = "XXXXXXXXXXXXXXXX"
# Placeholder phone numbers; presumably alerts go from `from_phone` to
# `to_phone` - TODO confirm.
from_phone = "+10123456789"
to_phone = "+10123456789"
# Placeholder callback URL; replace with your own.
default_callback = "https://<callback-url>"
# Alert channels: calls and texts are toggled independently.
enable_call = true
enable_text = true

# Each scraper gets its own [[lookout]] entry.
#
# Required keys:
#   name             - name of the scraper
#   url              - page to scrape
#   regex            - pattern to search for
#   expected_matches - number of expected matches
#   timeout          - timeout after each request
#
# Optional keys:
#   selectors - list of selectors which will be used to filter the html
#   headers   - custom request headers (see below)
[[lookout]]
name = "newegg"
url = "https://www.newegg.com/product"
regex = "OUT OF STOCK|SOLD OUT|Sold Out"
expected_matches = 3
timeout = 180
selectors = [
    "#app > div.page-content > div.page-section > div > div > div.row-side > div.product-buy-box",
    "#app > div.page-content > div.page-section > div > div > div.row-body > div.product-main.display-flex > div",
]

# Watches a B&H product page for its "Notify When Available" text.
[[lookout]]
name = "b&h"
url = "https://www.bhphotovideo.com/some/cool/product"
timeout = 180
regex = "Notify When Available"
expected_matches = 1

# Scraper for a Best Buy page; expects two "Sold Out" matches.
[[lookout]]
name = "bestbuy"
url = "https://www.bestbuy.com/site/something/"
regex = "Sold Out"
expected_matches = 2
timeout = 180

# Optional custom request headers for the scraper above. In TOML, a
# [lookout.headers] table belongs to the most recently defined [[lookout]]
# entry, so it must follow that entry's keys and precede the next [[lookout]].
[lookout.headers]
"User-Agent" = "Wget/1.20.1 (linux-gnu)"
"Accept-Encoding" = "identity"
"ACCEPT" = "*/*"
"Connection" = "Keep-Alive"

依赖项

~21–37MB
~598K SLoC