Crawler Notes, Continued

An Edge-driven crawler

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import requests
import json
import random
import pandas as pd
import numpy as np
import re
from pyquery import PyQuery as pq
import jieba

def get_one_page(url):  # fetch the raw HTML of a page with plain requests
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'Keep-Alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    }
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return None
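
For a static page this requests path is enough on its own; a minimal usage sketch (the URL is just an illustration):

html = get_one_page("https://www.ickd.cn/ems.html")
if html:
    print(html[:200])  # peek at the first 200 characters of the page source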

def get_city(text, pattern):  # text plus a regex, e.g. r'.*?(\w{7})[?。,]'; returns every captured match
    compiled = re.compile(pattern, re.S)
    return re.findall(compiled, text)
                          
def get_text(html, pattern):  # page source plus a CSS selector
    doc = pq(html)
    text1 = doc(pattern).text()
    return text1
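
get_text leans on pyquery's jQuery-style selection; a quick self-contained check (the HTML snippet is made up):

doc = pq("<table><tr><td class='col-3'>【北京】已揽收</td></tr></table>")
print(doc("td.col-3").text())  # -> 【北京】已揽收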

def get_word_s(texts):  # search-engine-mode segmentation; returns the second token
    cut = jieba.lcut_for_search(texts)
    return cut[1]  # assumes the input segments into at least two tokens

def get_word_j(texts):  # precise-mode segmentation; returns the second token
    cut = jieba.lcut(texts)
    return cut[1]  # for a bracketed string like '【北京】' this is the text between the brackets

def get_word_q(texts):  # full-mode segmentation; returns the second token
    cut = jieba.lcut(texts, cut_all=True)
    return cut[1]
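
The three jieba modes split the same string differently; a quick comparison (the exact tokens depend on the dictionary version shipped with jieba):

s = '【北京市邮政速递物流公司】'
print(jieba.lcut(s))                 # precise mode: no overlapping words
print(jieba.lcut(s, cut_all=True))   # full mode: every word the dictionary can find
print(jieba.lcut_for_search(s))      # search-engine mode: precise mode plus extra short words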

def open_edge_getsuoece(url):  # drive Edge, wait for the tracking table to render, return the page source
    browser = webdriver.Edge(executable_path="C:\\Users\\Public\\Documents\\Python Scripts\\msedgedriver.exe")
    browser.get(url)
    # input_one = browser.find_element_by_css_selector(...)
    # input_one.send_keys('text to type')
    # time.sleep(1)
    # input_one.clear()
    # button = browser.find_element_by_css_selector(...)
    # button.click()
    # browser.execute_script('some script')
    wait = WebDriverWait(browser, 15)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.col-3')))  # block until the results load
    text = browser.page_source
    browser.close()
    print('open_edge_getsuoece ok')
    return text
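
Note that the executable_path argument was deprecated and later removed in Selenium 4; on a current Selenium the driver is wired up through a Service object instead. A minimal sketch, assuming the same driver path:

from selenium.webdriver.edge.service import Service

service = Service(executable_path="C:\\Users\\Public\\Documents\\Python Scripts\\msedgedriver.exe")
browser = webdriver.Edge(service=service)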
def get_dz(text):  # pull the 【…】 station names out of the tracking text, deduplicated
    pattern = re.compile('.*?(【.*?】)', re.S)
    text1 = re.findall(pattern, text)
    text1 = text1[::-1]  # the page lists the newest scan first; reverse into chronological order
    print(text1)
    list1 = []
    for i in text1:
        te = get_word_j(str(i))  # second token = the name inside the brackets
        if te in list1:  # skip stations already collected
            continue
        list1.append(te)
    return list1
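
A quick sanity check on a hand-written snippet (the sample text is made up, and this assumes jieba keeps the 【 】 brackets as separate tokens):

sample = '到达【广州】处理中心,离开【北京】处理中心'
print(get_dz(sample))  # expected something like ['北京', '广州']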

url = "https://www.ickd.cn/ems.html#no=1101504223119"
html = open_edge_getsuoece(url)
selector = "td.col-3"  # the tracking-detail cells on ickd.cn
text = get_text(html, selector)
# offline alternative: read a saved copy of the page instead of driving the browser
# with open("C:\\Users\\西木康\\Desktop\\爬虫1\\110.txt", "r", encoding='utf-8') as f:
#     html = f.read()
text1 = get_dz(text)
print(text1)