思路
首先读取各公司的 URL,使用 Selenium 模拟用户的浏览器行为进行访问;然后使用 XPath 定位所需元素;最后将结果保存至文件中。
问题
同一个 IP 访问次数过多时,请求会被服务器拒绝(返回 HTTP 403)。
解决:配置动态ip代理池
Selenium无法使用有账号密码认证的代理池
解决:参考腾讯云开发者社区文章《Selenium 如何使用代理 IP 进行 Web 爬虫(无认证实现、有账号密码认证实现)》(tencent.com),通过动态生成一个 Chrome 扩展来设置代理并自动填写代理的账号密码。
def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password, scheme='http', plugin_path='Selenium-Chrome-HTTP-Private-Proxy.zip'):
    """Create a Chrome extension archive that configures an authenticated proxy.

    The archive contains two files: ``manifest.json`` (extension metadata and
    permissions) and ``background.js`` (a script that sets a fixed proxy
    server and answers proxy authentication challenges with the supplied
    credentials).

    Args:
        proxy_host: Host name or IP address of the proxy server.
        proxy_port: Port of the proxy server (int or numeric string).
        proxy_username: User name sent when the proxy requests authentication.
        proxy_password: Password sent when the proxy requests authentication.
        scheme: Proxy scheme written into the proxy configuration.
        plugin_path: Path of the zip archive to write; any existing file at
            this path is overwritten.

    Returns:
        ``plugin_path``, unchanged, so the result can be passed straight to
        the code that loads the extension.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block for the JSON escaping it needs.
    import json

    # NOTE(review): this is a Manifest V2 extension using blocking webRequest;
    # confirm the targeted Chrome version still loads MV2 extensions.
    manifest = {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking",
        ],
        "background": {
            "scripts": ["background.js"],
        },
        "minimum_chrome_version": "22.0.0",
    }
    manifest_json = json.dumps(manifest, indent=4)

    # Emit every caller-supplied value as a JSON string literal (which is also
    # a valid JavaScript string literal) so quotes or backslashes in, e.g.,
    # the password cannot break or alter the generated script.
    scheme_js = json.dumps(str(scheme))
    host_js = json.dumps(str(proxy_host))
    port_js = json.dumps(str(proxy_port))
    username_js = json.dumps(str(proxy_username))
    password_js = json.dumps(str(proxy_password))

    # Literal JavaScript braces are doubled ("{{" / "}}") because this is an
    # f-string; only the *_js names above are substituted.
    background_js = f"""
var config = {{
    mode: "fixed_servers",
    rules: {{
        singleProxy: {{
            scheme: {scheme_js},
            host: {host_js},
            port: parseInt({port_js})
        }},
        bypassList: ["foobar.com"]
    }}
}};
chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});
function callbackFn(details) {{
    return {{
        authCredentials: {{
            username: {username_js},
            password: {password_js}
        }}
    }};
}}
chrome.webRequest.onAuthRequired.addListener(
    callbackFn,
    {{urls: ["<all_urls>"]}},
    ['blocking']
);
"""

    with zipfile.ZipFile(plugin_path, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)
    return plugin_path
def configure_headless_browser(proxy_config):
"""配置并返回一个带代理设置的Chrome浏览器"""
chrome_options = Options()
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
prefs = {"profile.managed_default_content_settings.images": 2} # 禁用图片加载
chrome_options.add_experimental_option("prefs", prefs)
# 创建代理认证扩展
proxyauth_plugin_path = create_proxyauth_extension(
proxy_host=proxy_config[0],
proxy_port=proxy_config[1],
proxy_username=proxy_config[2],
proxy_password=proxy_config[3]
)
chrome_options.ad