# Edge-driven web crawler
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import requests
import json
import random
import pandas as pd
import numpy as np
import re
from pyquery import PyQuery as pq
import jieba
def get_one_page(url):  # fetch a page's HTML with requests, given its URL
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'Keep-Alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None
def get_city(text, pattern):  # text plus a regular expression (the argument is currently unused)
    # capture 7-character words that are followed by a punctuation mark
    patterns = re.compile(r'.*?(\w{7})[?。,]', re.S)
    text1 = re.findall(patterns, text)
    return text1
def get_text(html, pattern):  # page source plus a CSS selector
    doc = pq(html)
    text1 = doc(pattern).text()
    return text1
def get_word_s(texts):  # search-engine-mode segmentation, keep the second token
    cut = jieba.lcut_for_search(texts)
    return cut[1]
def get_word_j(texts):  # exact-mode segmentation, keep the second token
    cut = jieba.lcut(texts)
    return cut[1]
def get_word_q(texts):  # full-mode segmentation, keep the second token
    cut = jieba.lcut(texts, cut_all=True)
    return cut[1]
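# Why index 1: the strings fed to these helpers below look like "【北京市】", and jieba
# is expected to emit the opening bracket as its own token, so cut[1] is the place name
# itself. This is an assumption about the input format; a string without a leading
# bracket would need cut[0] instead.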
def open_edge_getsuoece(url):  # render the page in Edge and return its page source
    browser = webdriver.Edge(executable_path="C:\\Users\\Public\\Documents\\Python Scripts\\msedgedriver.exe")
    browser.get(url)
    #input_one = browser.find_element_by_css_selector()
    #input_one.send_keys('content')
    #time.sleep(1)
    #input_one.clear()
    #bottom = browser.find_element_by_css_selector()
    #bottom.click()
    #browser.execute_script('script')
    wait = WebDriverWait(browser, 15)
    # block until the tracking-detail cells (.col-3) have been rendered
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.col-3')))
    text = browser.page_source
    browser.close()
    print('open_edge_getsuoece ok')
    return text
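# Note: Selenium 4 removed the executable_path keyword; if the construction above raises
# a TypeError, the equivalent call (a sketch, assuming the same driver path) is:
#   from selenium.webdriver.edge.service import Service
#   browser = webdriver.Edge(service=Service("C:\\Users\\Public\\Documents\\Python Scripts\\msedgedriver.exe"))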
def get_dz(text):  # extract the bracketed place names (addresses) from the tracking text
    # capture every 【…】 bracketed segment
    pattern = re.compile(r'.*?(【.*?】)', re.S)
    text1 = re.findall(pattern, text)
    text1 = text1[::-1]  # reverse the order of the matches
    print(text1)
    list1 = []
    for i in text1:
        te = get_word_j(str(i))  # second jieba token = the name inside the brackets
        if te in list1:  # skip duplicates while keeping first-seen order
            continue
        list1.append(te)
    return list1
url = "https://www.ickd.cn/ems.html#no=1101504223119"
html = open_edge_getsuoece(url)
pation = "td.col-3"  # CSS selector for the tracking-detail cells
text = get_text(html, pation)
#with open("C:\\Users\\西木康\\Desktop\\爬虫1\\110.txt", "r", encoding='utf-8') as f:
#    html = f.read()
text1 = get_dz(text)
print(text1)