XPath 错误“方法未知:contains(@”的解决办法

本文介绍了如何使用XPath进行精确和模糊查询,包括属性值的查询方法。提供了详细的查询字符串示例,帮助读者快速掌握XPath的使用。
创建 xmlDom 后加上“xmlDom.setProperty('SelectionLanguage','XPath');”,把查询语言设为 XPath(旧版 MSXML 默认的查询语言是 XSLPattern,不支持 contains() 等 XPath 函数,因此会报“方法未知”)。
xPath 查询示例如下。等值查询:String xPath = "users/user[username='huo' and password='123']"; 模糊查询:String xPath = "users/user[contains(username,'huo') and contains(password,'123')]";
 ---------------------------------------------------------------------------------------------------------- 
XML 第二种存储方式(.xml):xPath 查询时加 "@",用以查询属性值。等值查询:String xPath = "users/user[@username='huo' and @password='123']"; 模糊查询:String xPath = "users/user[contains(@username,'huo') and contains(@password,'123')]";
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import datetime
import time
import sys
import os


class EnergySystemLogin:
    """Open the energy-management site in headless Chrome and log in."""

    def __init__(self):
        self.login_url = "http://10.11.20.117:7001/energy4/#/login"
        # NOTE(review): credentials are hard-coded in source; consider loading
        # them from the environment or a config file instead.
        self.credentials = {'username': 'E915285', 'password': '123456'}
        self.driver = None
        self.wait_timeout = 30  # seconds to wait for an element

    def initialize_driver(self):
        """Start headless Chrome configured for unattended file downloads.

        Returns:
            True when the driver was created; False when any step raised
            (the failure is reported through ``log_error``).
        """
        try:
            options = webdriver.ChromeOptions()

            # --- basic anti-detection settings ---
            options.add_argument("--disable-blink-features=AutomationControlled")
            options.add_argument("--start-maximized")
            options.add_argument("--headless=new")  # new headless mode
            options.add_argument("--disable-gpu")
            options.add_argument("--no-sandbox")
            options.add_argument("--disable-dev-shm-usage")
            options.add_argument("--window-size=1920,1080")
            options.add_argument("--log-level=3")
            options.add_experimental_option("excludeSwitches", ["enable-automation"])
            options.add_experimental_option("useAutomationExtension", False)

            # --- download configuration ---
            download_dir = r"D:\Class\能管比对\导出文件"  # change to the desired save path
            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(download_dir, exist_ok=True)

            prefs = {
                "download.default_directory": download_dir,  # must be an absolute path
                "download.prompt_for_download": False,  # do not ask where to save
                "download.directory_upgrade": True,
                "safebrowsing.enabled": True,  # avoids download warnings
                "profile.default_content_settings.popups": 0,  # block pop-ups
                "plugins.always_open_pdf_externally": False,  # optional
            }
            options.add_experimental_option("prefs", prefs)

            self.driver = webdriver.Chrome(options=options)
            # NOTE(review): an implicit wait is combined with explicit
            # WebDriverWait calls below; Selenium's documentation advises
            # against mixing the two — confirm this is intended.
            self.driver.implicitly_wait(10)

            # Hide navigator.webdriver from page scripts.
            # NOTE(review): this runs before any page is loaded, so it
            # presumably does not survive the later navigation — confirm.
            self.driver.execute_script(
                "Object.defineProperty(navigator, 'webdriver', {get: () => false});"
            )

            self.log_action(f"浏览器初始化完成,下载路径: {download_dir}")
            return True
        except Exception as e:
            self.log_error(f"浏览器初始化失败: {str(e)}")
            return False

    def log_action(self, message):
        """Print ``message`` prefixed with the current timestamp."""
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(f"[{timestamp}] {message}")

    def log_error(self, message):
        """Log an error and, when a driver exists, save an error screenshot."""
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        self.log_action(f"错误: {message}")
        if self.driver:
            try:
                self.driver.save_screenshot(f"error_{timestamp}.png")
            except Exception as e:
                # A failed screenshot must not mask the error being reported.
                print(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] 截图失败: {str(e)}")

    def locate_and_input(self, by, value, text, description):
        """Wait for an input to become visible, clear it and type ``text``.

        Args:
            by: Selenium locator strategy (e.g. ``By.CSS_SELECTOR``).
            value: Locator expression for the input element.
            text: Text to type into the element.
            description: Human-readable field name used in log messages.

        Returns:
            True on success, False when locating or typing raised.
        """
        try:
            element = WebDriverWait(self.driver, self.wait_timeout).until(
                EC.visibility_of_element_located((by, value))
            )
            element.clear()
            element.send_keys(text)
            self.log_action(f"{description} 输入完成")
            return True
        except Exception as e:
            self.log_error(f"{description} 输入失败: {str(e)}")
            return False

    def perform_login(self):
        """Run the login flow: open the page, fill credentials, submit, verify.

        Returns:
            True when the login container disappears after submitting;
            False on any failure (which is logged).
        """
        try:
            self.log_action("访问登录页面")
            self.driver.get(self.login_url)

            # Wait until the document has finished loading.
            WebDriverWait(self.driver, self.wait_timeout).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )

            # User name
            if not self.locate_and_input(
                By.CSS_SELECTOR,
                "input.el-input__inner[type='text']",
                self.credentials['username'],
                "用户名",
            ):
                return False

            # Password
            if not self.locate_and_input(
                By.CSS_SELECTOR,
                "input.el-input__inner[type='password']",
                self.credentials['password'],
                "密码",
            ):
                return False

            # Login button
            try:
                login_btn = WebDriverWait(self.driver, self.wait_timeout).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "button.loginBtn"))
                )
                login_btn.click()
                self.log_action("点击登录按钮")
            except Exception as e:
                self.log_error(f"点击登录按钮失败: {str(e)}")
                return False

            # Verify the login. The login URL itself already contains
            # "/energy4/#/", so the URL wait is satisfied immediately; the
            # disappearance of the login container is the effective check.
            try:
                WebDriverWait(self.driver, 20).until(
                    EC.url_contains("/energy4/#/")
                )
                WebDriverWait(self.driver, 10).until_not(
                    EC.presence_of_element_located((By.CLASS_NAME, "login-container"))
                )
                self.log_action("登录成功")
                self.driver.save_screenshot("login_success.png")
                return True
            except TimeoutException:
                self.log_error("登录验证超时,可能未成功登录")
                return False
        except Exception as e:
            self.log_error(f"登录过程发生异常: {str(e)}")
            return False

    def get_driver(self):
        """Return the WebDriver instance (None before initialization)."""
        return self.driver

    def cleanup(self):
        """Quit the browser if one was started; report but swallow failures."""
        if self.driver:
            try:
                self.driver.quit()
            except Exception as e:
                print(f"清理浏览器资源时出错: {str(e)}")


class ACSystemOperator:
    """Drive the logged-in UI through named groups of menu clicks.

    Each group is a list of ``{"xpath": ..., "desc": ...}`` steps. To add a
    new sequence (operations3, operations4, ...), add an entry to
    ``OPERATION_GROUPS`` or pass a mapping to ``ACoperate``.
    """

    # Steps shared by every group defined so far: navigate down the tree to
    # the chilled-water cooling-power item and select the monthly view.
    NAVIGATION_STEPS = [
        {"xpath": "//span[@slot='title'][text()='能耗统计']", "desc": "能耗统计"},
        {"xpath": "//span[text()='分类统计' and contains(@data-v-11708afe,'')]", "desc": "分类统计"},
        {"xpath": "//div[@role='button' and contains(@class,'el-collapse-item__header')][normalize-space()='系统分组']", "desc": "系统分组"},
        {"xpath": "//span[@data-v-4fe475b6 and @title='空调系统' and normalize-space()='空调系统']", "desc": "空调系统"},
        {"xpath": "//span[@data-v-4fe475b6 and @title='冷热水系统' and normalize-space()='冷热水系统']", "desc": "冷热水系统"},
        {"xpath": "//div[contains(@class, 'el-tree-node__content') and contains(@style, 'padding-left: 36px')]//span[contains(@title, '低温冰水系统')]", "desc": "低温冰水系统"},
        {"xpath": "//div[contains(@style, 'padding-left: 54px')]//span[contains(@title, '低温冰水制冷功率') and @data-v-4fe475b6]", "desc": "低温冰水制冷功率"},
        {"xpath": "//div[contains(@style, 'padding-left: 72px')]//span[contains(@title, '低温冰水制冷功率') and @data-v-4fe475b6]", "desc": "子项 - 低温冰水制冷功率"},
        {"xpath": "//span[@class='el-radio-button__inner' and normalize-space()='月']", "desc": "月"},
    ]

    # Named click sequences, executed in insertion order by ACoperate().
    OPERATION_GROUPS = {
        "operations1": NAVIGATION_STEPS + [
            {"xpath": "//span[@class='el-radio-button__inner' and normalize-space()='能源种类']", "desc": "能源种类"},
            {"xpath": "//button[contains(@class, 'el-button') and contains(span/text(), '查 询')]", "desc": "查询"},
            {"xpath": "//button[contains(@class, 'el-button') and contains(span/text(), '导出Excel')]", "desc": "导出"},
        ],
        "operations2": list(NAVIGATION_STEPS),
    }

    def __init__(self, driver):
        self.driver = driver
        self.wait_timeout = 10
        self.click_delay = 3  # seconds to pause after every click

    def get_timestamp(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def log_action(self, message):
        """Print an action message (no screenshot is taken per step)."""
        print(f"[{self.get_timestamp()}] 操作: {message}")

    def log_error(self, message):
        """Print an error message and try to save an error screenshot."""
        print(f"[{self.get_timestamp()}] 错误: {message}")
        try:
            self.driver.save_screenshot(f"error_{int(time.time())}.png")
        except Exception as e:
            # This method is called from except-handlers; a screenshot failure
            # must not turn a logged error into an uncaught exception.
            print(f"[{self.get_timestamp()}] 截图失败: {str(e)}")

    def click_element(self, xpath, description):
        """Click the element at ``xpath`` once; log and return False on failure.

        Args:
            xpath: XPath locating the element to click.
            description: Human-readable step name used in log messages.

        Returns:
            True when the click succeeded, False otherwise.
        """
        try:
            element = WebDriverWait(self.driver, self.wait_timeout).until(
                EC.visibility_of_element_located((By.XPATH, xpath))
            )
            element.click()
            self.log_action(f"点击成功: {description}")
            time.sleep(self.click_delay)  # fixed pause after the click
            return True
        except Exception as e:
            self.log_error(f"点击失败 ({description}): {str(e)}")
            return False

    def _run_operations(self, group_name, operations):
        """Click every step of one group, then save a final screenshot."""
        self.log_action(f"开始执行操作组: {group_name}")
        for op in operations:
            if not self.click_element(op["xpath"], op["desc"]):
                self.log_error(f"操作中断于: {op['desc']}")
                return False

        # All clicks done: give charts / Ajax data time to load.
        self.log_action("所有点击已完成,等待3秒以便页面加载...")
        time.sleep(3)

        screenshot_name = f"final_result_{int(time.time())}.png"
        self.driver.save_screenshot(screenshot_name)
        self.log_action(f"✅ 最终截图已保存: {screenshot_name}")
        return True

    def ACoperate(self, operation_groups=None):
        """Run the click sequences that lead to the chilled-water power view.

        The previous version returned right after ``operations1`` and its
        second loop iterated ``operations1`` again, so ``operations2`` never
        ran. Groups are now executed one after another, stopping at the
        first failure.

        Args:
            operation_groups: Optional mapping of group name to a list of
                ``{"xpath": ..., "desc": ...}`` steps. Defaults to
                ``OPERATION_GROUPS`` (operations1, then operations2).

        Returns:
            True when every group completed, False as soon as one step fails.
        """
        if operation_groups is None:
            operation_groups = self.OPERATION_GROUPS

        for group_name, operations in operation_groups.items():
            if not self._run_operations(group_name, operations):
                return False
        return True


def main():
    """Log in, run the menu operations and always release the browser."""
    login = None
    try:
        login = EnergySystemLogin()

        if not login.initialize_driver():
            print("❌ 浏览器初始化失败,程序终止")
            sys.exit(1)

        if not login.perform_login():
            print("❌ 登录失败,程序终止")
            sys.exit(1)

        # Reuse the logged-in driver for the follow-up menu operations.
        driver = login.get_driver()
        operator = ACSystemOperator(driver)
        if operator.ACoperate():
            print("🎉 操作执行成功!")
        else:
            print("⚠️ 操作执行失败,请查看错误日志和截图")
    except KeyboardInterrupt:
        print("\n\n🛑 用户手动中断程序")
    except Exception as e:
        print(f"🚨 程序发生未预期异常: {str(e)}")
    finally:
        if login:
            login.cleanup()
        print("🔚 程序结束")


# ========================
# Program entry point
# ========================
if __name__ == "__main__":
    main()

# Request attached to the original snippet: modify this code so that, besides
# operations1, further sequences (operations2, 3, 4, ...) can be added later.
# Handled above through ACSystemOperator.OPERATION_GROUPS.
最新发布
10-30
import requests
from lxml import etree
import csv
import time
import random
import re
import os

# Fixed request-header values (replace with real values from the browser).
FIXED_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0'
# NOTE(review): this is a logged-in session cookie committed in source;
# consider loading it from the environment instead.
FIXED_COOKIE = 'll="118161"; bid=l5ki4SOlbBM; dbcl2="244638424:dLHXPIU8S0M"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.24463; _pk_id.100001.4cf6=42b01014d5c31947.1748938983.; _vwo_uuid_v2=D9E78C6D9D4E71BBB6EC73B8583864961|9da3be87da4a6d3be6203809b085d4a9; __yadk_uid=2Zr6yzTnllQxMzDhrQB82h7doa8gM4Ku; ck=ILlj; ap_v=0,6.0; frodotk_db="dd9f2f5023b9a95198dd8df06b2cfbf3"; __utma=30149280.1697373246.1748938900.1749979892.1750127075.8; __utmc=30149280; __utmz=30149280.1750127075.8.6.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.2.10.1750127075; __utma=223695111.238348316.1748938983.1749979902.1750127082.8; __utmb=223695111.0.10.1750127082; __utmc=223695111; __utmz=223695111.1750127082.8.7.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1750127082%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=1'  # copy from the browser

# Base URL and request headers
base_url = "https://movie.douban.com/subject/27181010/reviews"
headers = {
    'User-Agent': FIXED_USER_AGENT,
    'Cookie': FIXED_COOKIE,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'https://movie.douban.com/subject/27181010/',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
}

# Output directory for the CSV and any saved diagnostic pages
os.makedirs('douban_data', exist_ok=True)


def extract_content(element):
    """Return the review text found under ``element``.

    Tries the short-content, full-content and folded containers in that
    order and returns the first non-empty one, joined and stripped; returns
    "无内容" when none matches.
    """
    content_queries = (
        './/div[contains(@class, "short-content")]/text()',
        './/div[@class="review-content clearfix"]/text()',
        './/div[@class="folded"]/text()',
    )
    for query in content_queries:
        pieces = element.xpath(query)
        if pieces:
            return ''.join(pieces).strip()
    return "无内容"


def first_match(node, *queries):
    """Return the result of the first XPath in ``queries`` that matches.

    Returns an empty list when no query yields anything.
    """
    for query in queries:
        found = node.xpath(query)
        if found:
            return found
    return []


def save_debug_page(kind, page, text):
    """Save a response body under douban_data/ for later inspection."""
    with open(f'douban_data/{kind}_page_{page}.html', 'w', encoding='utf-8') as f:
        f.write(text)


def parse_review(comment, page, idx):
    """Build one CSV row from a review node.

    Args:
        comment: lxml element of a single review item.
        page: Zero-based page index the review was found on.
        idx: Zero-based position of the review within the page.

    Returns:
        A list matching the CSV header: nickname, rating, time, text,
        useful count, reply count, position label.
    """
    username = first_match(
        comment,
        './/a[contains(@class, "name")]/text()',
        './/span[@class="author"]/a/text()',
    )
    username = username[0].strip() if username else "无昵称"

    rating = first_match(
        comment,
        './/span[contains(@class, "rating")]/@title',
        './/span[contains(@class, "main-title-rating")]/@title',
    )
    rating = rating[0] if rating else "无评分"

    comment_time = first_match(
        comment,
        './/span[contains(@class, "main-meta")]/text()',
        './/span[@class="review-date"]/text()',
    )
    comment_time = comment_time[0].strip() if comment_time else "无时间"

    content = extract_content(comment)

    useful_count = first_match(
        comment,
        './/button[contains(@data-count, "useful")]/span/text()',
        './/button[contains(@class, "useful")]/span/text()',
    )
    useful_count = useful_count[0].strip() if useful_count else "0"

    reply_count = first_match(
        comment,
        './/a[contains(@class, "replythis")]/text()',
        './/span[contains(@class, "reply-count")]/a/text()',
    )
    # Keep only the digits of the reply label.
    reply_count = re.sub(r'\D', '', reply_count[0]) if reply_count else "0"

    return [
        username,
        rating,
        comment_time,
        content,
        useful_count,
        reply_count,
        f"第{page+1}页第{idx+1}条",
    ]


def scrape_page(page, writer):
    """Fetch one page of reviews and append its rows to ``writer``.

    Pages that trigger a redirect, a captcha or contain no reviews are
    saved under douban_data/ for inspection and skipped.

    Args:
        page: Zero-based page index (20 reviews per page).
        writer: ``csv.writer`` receiving one row per review.
    """
    params = {
        'start': page * 20,
        'sort': 'new_score',
        'status': 'P',
    }
    print(f"开始爬取第 {page+1} 页...")

    # Redirects are disabled so a verification redirect can be detected.
    response = requests.get(
        url=base_url,
        params=params,
        headers=headers,
        timeout=15,
        allow_redirects=False,
    )

    # A 302 means the site wants a verification step.
    if response.status_code == 302:
        location = response.headers.get('Location', '未知位置')
        print(f"⚠️ 第 {page+1} 页触发验证,重定向至: {location}")
        save_debug_page('redirect', page, response.text)
        return

    response.encoding = 'utf-8'
    if response.status_code != 200:
        print(f"❌ 请求失败,状态码: {response.status_code}")
        return

    html = etree.HTML(response.text)

    # Captcha form present?
    if html.xpath('//input[@name="captcha-id"]'):
        print(f"⚠️ 第 {page+1} 页需要验证码,跳过")
        save_debug_page('captcha', page, response.text)
        return

    # Review container, with a looser fallback selector.
    review_container = first_match(
        html,
        '//div[@class="review-list"]',
        '//div[contains(@id, "content")]//div[contains(@class, "review")]',
    )
    if not review_container:
        save_debug_page('error', page, response.text)
        print(f"❌ 第 {page+1} 页无评论容器,已保存页面供分析")
        return

    # Review items, with fallback selectors.
    comments = first_match(
        html,
        '//div[contains(@class, "review-item")]',
        '//div[contains(@class, "main") and contains(@class, "review-item")]',
        '//div[@class="review-list"]/div[contains(@class, "review")]',
    )
    if not comments:
        print(f"❌ 第 {page+1} 页找到0条评论,可能触发反爬")
        if html.xpath('//div[contains(text(), "检测到异常请求")]'):
            print("⚠️ 检测到反爬提示,请更换Cookie或IP")
        save_debug_page('antispider', page, response.text)
        return

    print(f"✅ 第 {page+1} 页找到 {len(comments)} 条评论")
    for idx, comment in enumerate(comments):
        try:
            writer.writerow(parse_review(comment, page, idx))
        except Exception as e:
            # One malformed review must not abort the rest of the page.
            print(f"⚠️ 处理评论时出错: {e}")


# The context manager guarantees the CSV is flushed and closed even when the
# run is interrupted.
with open('douban_data/douban_reviews_fixed.csv', 'w', newline='', encoding='utf-8-sig') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['昵称', '星级评分', '评论时间', '文本评论', '点赞数', '回应数', '页面位置'])

    for page in range(0, 125):
        try:
            scrape_page(page, writer)
        except Exception as e:
            print(f"❌ 请求异常: {e}")

        # Random delay after EVERY page, including skipped ones: the earlier
        # `continue` paths bypassed it and re-requested immediately after a
        # redirect or captcha response.
        delay = random.uniform(3, 8)
        print(f"⏳ 等待 {delay:.2f} 秒后继续...")
        time.sleep(delay)

print("✅ 爬取完成!数据已保存到 douban_data/douban_reviews_fixed.csv")

# Request attached to the original snippet: adjust the extraction of review
# time, useful count and reply count so they are output correctly, and add a
# "useless votes" (负赞数) column.
# NOTE(review): not implemented here — the selectors needed depend on the live
# page markup, which is not visible in this file.
06-19
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值