Removing the "未选择文件" ("No file selected") text of <input type="file"> across browsers

This article shows how to hide the default "未选择文件" (No file selected) hint of a file input by combining HTML, CSS and a little JavaScript: a transparent <input type="file"> is layered over a styled button and text box, and a change handler copies the selected file's value into the text box, replacing the default prompt.

With the default appearance, that "未选择文件" label is exactly what you want to get rid of.


The result of running the code below: the "未选择文件" text is gone.




<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>无标题文档</title>
<style>
/* Wrapper that stacks the visible text box / button and the real file input */
#file   { width:226px; height:25px; position:relative; }
#filetxt{ }
#filebtn{ position:absolute; right:0px; }
/* The real file input sits on top of the button but is fully transparent,
   so clicks on "SELECT" still open the file dialog */
#upfile { width:70px; position:absolute; right:0px; top:0px; opacity:0; }
</style>
<script src="http://apps.bdimg.com/libs/jquery/2.1.4/jquery.min.js"></script>
</head>

<body>
<div id="file">
    <input id="filetxt" type="text" value="未选择文件" />
    <input id="filebtn" type="button" value="SELECT" />
    <input id="upfile" type="file" />
</div>

<script>
$(function(){
    // When a file is chosen, mirror its value into the visible text box,
    // replacing the "未选择文件" placeholder
    $('#upfile').change(function(){
        $('#filetxt').val($('#upfile').val());
    });
});
</script>
</body>
</html>
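
One detail to be aware of: for security reasons, most browsers report a file input's value as something like C:\fakepath\filename.ext rather than the real path, so the text box above will show that prefix. If you only want to display the file name itself, a small variation of the change handler (a sketch reusing the same element IDs as the example above) could look like this:

<script>
$(function(){
    $('#upfile').change(function(){
        // The browser returns e.g. "C:\fakepath\photo.png";
        // strip everything up to the last slash/backslash to keep only the file name
        var name = $(this).val().replace(/^.*[\\\/]/, '');
        $('#filetxt').val(name);
    });
});
</script>

Nothing else needs to change; the transparent-overlay trick above still handles the visual part.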
