webrpa 是一个分布式的网络爬虫系统,基于 fastapi + fastadmin 开发,通过 Web API 接口发起网络爬虫服务,实现流程自动化或数据自动抓取。它包含两部分,整体调用关系如下图所示:
```mermaid
graph LR
client-->manager-->worker1
manager-->worker2
manager-->workers[worker...]
```
主要实现的功能包括:
引入 browser-use,通过 LLM 自动创建数据爬虫服务。
{
  "name": "szreorc",
  "desc": "深圳不动产查询",
  "driver": "firefox",
  "url": "",
  "debug": true,
  "window_size": "1920x1080",
  "action_timeout": 5,
  "wait_redirect": true,
  "wait_redirect_interval": 2,
  "identifier": "{username}-{BuildingName}-{UNIT_NO}",
  "credential": "{username}",
  "actions": {
    "1": {
      "desc": "确认登录",
      "action": "check_variable",
      "options": {
        "script": "return window.location.href;",
        "target": "^https://pnr.sz.gov.cn/d-ghrer/reroosp/ytcf"
      }
    },
    "10": {
      "desc": "用户名密码登录",
      "action": "click",
      "timeout": 2,
      "target": ["xpath", "//a[contains(@class, 'login-tab') and normalize-space(text())='账号密码']"]
    },
    "11": {
      "desc": "输入用户名",
      "action": "input_text",
      "target": ["xpath", "//input[@type='text' and @placeholder='请输入账号']"],
      "param": "username"
    },
    "12": {
      "desc": "增加计数",
      "action": "variable",
      "options": {
        "variable": "counter1",
        "operator": "+"
      }
    },
    "13": {
      "desc": "检测计数",
      "action": "variable",
      "stop_on_fail": true,
      "options": {
        "variable": "counter1",
        "operator": "<",
        "target": 2,
        "sleep": 2000
      }
    },
    "14": {
      "desc": "输入密码",
      "action": "input_text",
      "target": ["xpath", "//input[@type='password' and @placeholder='请输入密码']"],
      "param": "password"
    },
    "15": {
      "desc": "识别 captcha",
      "action": "decode_captcha_code",
      "target": ["xpath", "//div[contains(@class, 'captcha-body') and @title='点击刷新']"],
      "options": {
        "code_type": 11
      }
    },
    "16": {
      "desc": "输入 captcha",
      "action": "input_text",
      "target": ["xpath", "//div[contains(@class, 'account_verifying')] //input[@type='text']"]
    },
    "17": {
      "desc": "点击登录",
      "action": "click",
      "target": ["xpath", "//button[contains(@class, 'gd-btn-primary') and contains(@class, 'gd-btn') and @type='button']//span[starts-with(text(), '登录 ')]"]
    },
    "18": {
      "desc": "继续登录",
      "action": "click",
      "target": ["xpath", "//button[.//span[contains(text(), '继续登录')]]"]
    },
    "20": {
      "desc": "确认选择",
      "action": "click",
      "timeout": 10,
      "stop_on_fail": true,
      "fail_message": "login failed",
      "options": {
        "set_credential": true
      },
      "target": ["class name", "jinruxuzhi-checkbox"]
    },
    "21": {
      "desc": "确认选择下一步",
      "action": "click",
      "target": ["class name", "jinruxuzhi-buttonOk"]
    },
    "30": {
      "desc": "展开查询类型",
      "action": "click",
      "options": {
        "sleep": 2
      },
      "target": ["xpath", "//input[@type='text' and @placeholder='请选择']"]
    },
    "31": {
      "desc": "等待下拉菜单",
      "action": "wait_element",
      "options": {
        "visible": true
      },
      "target": ["css selector", "div.el-select-dropdown.el-popper"]
    },
    "32": {
      "desc": "选择查询类型",
      "action": "click",
      "target": ["xpath", "//li[contains(@class, 'el-select-dropdown__item') and span[text()='楼名及栋名']]"]
    },
    "33": {
      "desc": "输入查询内容",
      "action": "input_text",
      "target": ["xpath", "//input[@type='text' and @placeholder='请输入内容']"],
      "param": "BuildingName"
    },
    "34": {
      "desc": "点击查询",
      "action": "click",
      "target": ["class name", "el-icon-search"]
    },
    "35": {
      "desc": "点击截图对象",
      "action": "click",
      "timeout": 20,
      "stop_on_fail": true,
      "fail_message": "search failed",
      "target": ["xpath", "//div[contains(@class, 'el-dialog__wrapper')]//div[contains(@class, 'el-tabs__item') and normalize-space(text())='楼宇']"]
    },
    "40": {
      "desc": "获取数据",
      "action": "get_data",
      "options": {
        "script": "var table = document.querySelector(\"#pane-1 table.is-bordered.el-descriptions--mini\");\nvar fields = [\"土地坐落\", \"楼名及栋名\", \"房屋类型\", \"房屋性质\", \"房屋用途\"];\nvar result = {};\nif (table) {\n  table.querySelectorAll(\"tr.el-descriptions-row\").forEach(function(row) {\n    var labels = row.querySelectorAll(\"th.el-descriptions-item__label\");\n    var contents = row.querySelectorAll(\"td.el-descriptions-item__content\");\n    var count = Math.min(labels.length, contents.length);\n    for (var i = 0; i < count; i++) {\n      var label = labels[i].innerText.trim();\n      if (fields.includes(label)) {\n        result[label] = contents[i].innerText.trim();\n      }\n    }\n  });\n  console.log(JSON.stringify(result));\n} else {\n  console.log(\"Table not found.\");\n}\nreturn result;\n"
      }
    },
    "41": {
      "desc": "点击截图对象",
      "action": "click",
      "target": ["xpath", "//div[contains(@class, 'el-dialog__wrapper')]//div[contains(@class, 'el-tabs__item') and normalize-space(text())='房屋']"]
    },
    "42": {
      "desc": "下拉房屋查询",
      "action": "click",
      "target": ["css selector", "#pane-2 input.el-input__inner"]
    },
    "43": {
      "desc": "点击房屋查询",
      "action": "click",
      "target": ["xpath", "//li[contains(@class, 'el-select-dropdown__item')]//span[text()='{UNIT_NO}']"],
      "param": "UNIT_NO"
    },
    "44": {
      "desc": "截图",
      "action": "screenshot",
      "target": ["class name", "el-dialog__wrapper"],
      "options": {
        "visible": true
      }
    },
    "45": {
      "desc": "获取数据",
      "action": "get_data",
      "options": {
        "script": "var table = document.querySelector(\"#pane-2 table.is-bordered.el-descriptions--mini\");\nvar fields = [\"房号\", \"所在楼层\", \"建筑面积\", \"使用年限\", \"存在抵押\", \"存在查封\", \"存在异议\", \"存在居住权\"];\nvar result = {};\nif (table) {\n  table.querySelectorAll(\"tr.el-descriptions-row\").forEach(function(row) {\n    var labels = row.querySelectorAll(\"th.el-descriptions-item__label\");\n    var contents = row.querySelectorAll(\"td.el-descriptions-item__content\");\n    var count = Math.min(labels.length, contents.length);\n    for (var i = 0; i < count; i++) {\n      var label = labels[i].innerText.trim();\n      if (fields.includes(label)) {\n        result[label] = contents[i].innerText.trim();\n      }\n    }\n  });\n  console.log(JSON.stringify(result));\n} else {\n  console.log(\"Table not found.\");\n}\nreturn result;\n"
      }
    }
  },
  "processes": "start->1\n1(no)->10->11\n11(no)->12->13\n13(yes)->10\n11(yes)->14->15->16->17->18->20->21->30->31->32->33->34->35->40->41->42->43->44->45->end\n1(yes)->20",
  "result": ["screenshot", "data"]
}