Web Scraping (Part 1): 20 Small Crawler Examples

1. Baidu Homepage

1. Import the requests module

import requests

2. Set the Baidu URL

url = "https://www.baidu.com/"

3. Request method: GET

3.1 Disguise the request by adding headers:

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36','Cookie':'BIDUPSID=F6BBCD59FE2A646812DB8DAE641A0BE5; PSTM=1573713375; BAIDUID=F6BBCD59FE2A6468D0329C1E2F60212F:FG=1; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1452_21098_29568_29221_26350; delPer=0; BD_CK_SAM=1; PSINO=2; H_PS_645EC=50d5uY51q2qJG%2BVlK7rlPmCgY73TcN9qKRz4sPKuBII1GIkIx4QkChitGd4; BDSVRTM=209'}

4. Send the request and get the response

response = requests.get(url=url,headers=headers)

5. Inspect the response content

(1) response.text: the body decoded to text (requests guesses the encoding)

(2) response.content: the raw bytes; call .content.decode('utf-8') to decode explicitly
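
A minimal sketch of the difference between the two, assuming the request above succeeded:

# response.text lets requests guess the encoding from the headers,
# while response.content gives raw bytes you decode yourself.
print(response.status_code)               # e.g. 200 on success
print(type(response.text))                # <class 'str'>
print(type(response.content))             # <class 'bytes'>
html = response.content.decode('utf-8')   # explicit decoding avoids mojibake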

6. Write to a local file

with open("baidu.html",'w',encoding="utf-8") as fp:

         fp.write(response.content.decode("utf-8"))

2. Baidu Tieba

import requests
kw = input("请输入你要访问的贴吧名称:")
url = 'https://tieba.baidu.com/f?ie=utf-8&kw={}&fr=search&pn='.format(kw)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}

for page in range(10):
    full_url = url+str(page*50)

    # Send the request
    response = requests.get(url=full_url,headers=headers).content.decode("utf-8")

    # Save the page
    with open("tieba{}.html".format(page+1),'w',encoding='utf-8') as fb:
        fb.write(response)

=============== Supplementary notes ===============

# GET request parameters:
params = {
    'ie': 'utf-8',
    'kw': 'python',
    'fr': 'search',
    'red_tag': 'y2156030250'
}
url = 'http://tieba.baidu.com/f?'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}
response = requests.request('get',url=url,params=params,headers=headers).content.decode('utf-8')
print(response)

3. Baidu Translate

import requests
import json

def fanyi(kw):
    # 1.url
    url = 'https://fanyi.baidu.com/sug'

    # Request method: POST

    # 2. Parameters:
    data = {'kw': kw}

    # 3. Send the request
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}
    response = requests.post(url=url,data=data,headers=headers).content.decode('utf-8')
    response = json.loads(response)
    for i in response['data']:
        word = i["k"]
        translate = i["v"]
        print(word+":"+translate+'\n')
        with open('{}.txt'.format(kw),'a',encoding='utf-8') as fp:
            fp.write(word+":"+translate+'\n')

if __name__ == '__main__':
    while True:
        kw = input("请输入你要翻译的内容======>")
        fanyi(kw)

4. Renren

import requests

# 1.url
url = 'http://www.renren.com/PLogin.do'

# Method: POST
# 2. Parameters:
data = {
    'email':'18811176939',
    'password':'123457'
}

# 3. Send the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
response = requests.post(url=url,data=data,headers=headers).content.decode('utf-8')
with open('renren.html','w',encoding='utf-8') as fp:
    fp.write(response)

5. Proxy Configuration

import requests
import random
# Build a proxy pool
proxies = [
    {'http':'124.113.217.5:9999','https':''},
    {'http':'183.164.239.177','https':''}
]
# Randomly pick a proxy IP
prox = random.choice(proxies)
print(prox)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',}

response = requests.get('https://www.kuaidaili.com/free/inha/',proxies=prox,headers=headers).content.decode('utf-8')
print(response)
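
Free proxies die constantly, so a single request through one often fails. A hedged sketch of a more defensive fetch (the timeout value and the direct-request fallback are my own additions, not part of the original):

def fetch_with_proxy(url, headers, pool, timeout=5):
    # Try each proxy in random order; fall back to a direct request.
    for prox in random.sample(pool, len(pool)):
        try:
            resp = requests.get(url, proxies=prox, headers=headers, timeout=timeout)
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            continue  # dead or slow proxy, try the next one
    return requests.get(url, headers=headers, timeout=timeout)

response = fetch_with_proxy('https://www.kuaidaili.com/free/inha/', headers, proxies)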

6. Amap (Gaode Maps)

import requests
import json
def weatherlist(url1,url2,headers,proxies):
    response = requests.get(url=url1, headers=headers, proxies=proxies).content.decode('utf-8')
    response = json.loads(response)
    for i in response["data"]["cityByLetter"].values():
        for j in i:
            adcode = j["adcode"]
            name = j["name"]
            full_url = url2+adcode
            response = requests.get(url=full_url, headers=headers, proxies=proxies).content.decode('utf-8')
            response = json.loads(response)
            print(response)
            try:
                if response["data"]["data"]:
                    for day in response["data"]["data"]:
                        for forecast in day['forecast_data']:
                            weather_name = forecast['weather_name']
                            temp_min = forecast['min_temp']
                            temp_max = forecast['max_temp']
                            with open('weather_list.txt', 'a', encoding='utf-8') as fp:
                                fp.write("城市:"+name+" 天气: "+weather_name+" 最高气温: "+temp_max
                                    +" 最低气温: "+temp_min+'\n')
            except (KeyError, TypeError):
                print('空')  # this city has no weather data
if __name__ == '__main__':
    url1 = 'https://www.amap.com/service/cityList'
    url2 = 'https://www.amap.com/service/weather?adcode='
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36','Cookie':'BIDUPSID=F6BBCD59FE2A646812DB8DAE641A0BE5; PSTM=1573713375; BAIDUID=F6BBCD59FE2A6468D0329C1E2F60212F:FG=1; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1452_21098_29568_29221_26350; delPer=0; BD_CK_SAM=1; PSINO=2; H_PS_645EC=50d5uY51q2qJG%2BVlK7rlPmCgY73TcN9qKRz4sPKuBII1GIkIx4QkChitGd4; BDSVRTM=209'}
    proxies = {'http':'124.113.217.5:9999','https':''}
    weatherlist(url1,url2,headers,proxies)

7. iCIBA (Kingsoft PowerWord)

import requests
import json

def fanyi(url,headers,proxies,data):
    response = requests.post(url=url,headers=headers,proxies=proxies,data=data).content.decode('utf-8')
    response = json.loads(response)
    print(response)
if __name__ == '__main__':
    url = 'http://fy.iciba.com/ajax.php?a=fy'
    w = input("请输入你要翻译的单词=======>")
    data = {
        'f': 'auto',
        't': 'auto',
        'w': w
    }

    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36','Cookie': 'UM_distinctid=16e9cecbfbb12e-0701242418f796-54133310-100200-16e9cecbfbc135; CNZZDATA1256573702=520451556-1574586989-%7C1574586989; __gads=ID=80bd4d4328d6d249:T=1574590731:S=ALNI_MaVD1f5SOmn3mHzHr4qp3LOGH6REA','a': 'fy'}

    fanyi(url,headers,proxies,data)

8. Renren (Two Login Methods)

# Method 1 ==========


import requests
url = 'http://www.renren.com/PLogin.do'
data = {
    'email':'**********',
    'password':'*******'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
response = requests.post(url=url,headers=headers,data=data).content.decode("utf-8")

url2 = 'http://www.renren.com/964508169/newsfeed/photo'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
'Cookie': 'anonymid=k39t0b6ygzu4ll; depovince=GW; _r01_=1; jebe_key=a4314260-8a4a-4342-8995-c9e648a7a1b8%7C9a7ca8d077b15761409e1298ab30421f%7C1574406709891%7C1%7C1574406774042; JSESSIONID=abcn0y6W3hiuoaYCyCG6w; ick_login=0e033cf7-03c1-4f2d-87c4-3792223ab81b; first_login_flag=1; ln_uact=18811176939; ln_hurl=http://hdn.xnimg.cn/photos/hdn421/20191125/1040/main_v6a1_0aac000038d2195a.jpg; wp_fold=0; jebecookies=a12f9f8a-7b0f-413a-b5b9-17fc13c3e2e5|||||; _de=E1BF45DCCBECABDD1B5D679B401790AD; p=4eca4158940cc418e6f44861141b88f89; t=f0efd7f6c081ed1e0658bfbb470710c59; societyguester=f0efd7f6c081ed1e0658bfbb470710c59; id=964508169; xnsid=5f89fe84; ver=7.0; loginfrom=null; jebe_key=a4314260-8a4a-4342-8995-c9e648a7a1b8%7Cf096e2c5efb5a6aed844642618d7e763%7C1574650138110%7C1%7C1574650203722'}
response2 = requests.get(url=url2,headers=headers).content.decode('utf-8')
with open('ren.html','w',encoding='utf-8') as fp:
    fp.write(response2)




# Method 2

# (1) Create a session object
sess = requests.session()
# (2) Log in; the session stores the client's identity (cookies)
url1 = 'http://www.renren.com/PLogin.do'
data = {
    'email':'18811176939',
    'password':'123457'
}
sess.post(url=url1,data=data)

# (3) Visit a page that requires login
url2 = 'http://www.renren.com/964508169/newsfeed/photo'
response = sess.get(url=url2).content.decode('utf-8')

with open('renren.html','w',encoding='utf-8') as fp:
    fp.write(response)
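
The session object re-sends the stored login cookies with every later request, which is what makes this second method work. A quick sanity check (a sketch, assuming the login above succeeded):

# The cookies set by the login response live on the session object
# and are attached to every subsequent request automatically.
print(sess.cookies.get_dict())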

9. Douban Movies

import requests
import json

def douban(url,headers,proxies):
    content = requests.get(url=url, headers=headers, proxies=proxies).content.decode('utf-8')
    movie_data = json.loads(content)
    data_list = []
    for movie in movie_data:
        data_dict = {}
        data_dict['title'] = movie.get('title')
        data_dict['regions'] = movie.get('regions')
        data_dict['types'] = movie.get('types')
        data_dict['url'] = movie.get('url')
        data_dict['actors'] = movie.get('actors')
        data_list.append(data_dict)
    json_data = json.dumps(data_list,ensure_ascii=False)
    with open('moviedata.json','w',encoding='utf-8') as fp:
        fp.write(json_data)

if __name__ == '__main__':
    url = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=0&limit=20'
    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
    douban(url,headers,proxies)
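
The start and limit query parameters in the URL page through the chart. A hedged pagination sketch (the page size of 20 matches the URL above; the total of 100 items is an arbitrary illustration):

# Walk the chart by advancing the start offset; .json() parses the response body.
base = 'https://movie.douban.com/j/chart/top_list'
for start in range(0, 100, 20):
    params = {'type': 11, 'interval_id': '100:90', 'action': '',
              'start': start, 'limit': 20}
    page = requests.get(base, params=params, headers=headers, proxies=proxies).json()
    print(len(page), 'movies at offset', start)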

10. Maoyan Movies

import requests
import json
import re

def maoyan(url,headers,proxies):
    response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
    pattern = re.compile(r'<dd>[\d\D]*?</dd>')
    movie_list = pattern.findall(response)
    movielist  = []
    for movie in movie_list:
        movie_dict = {}
        # Title
        pattern = re.compile(r'}">(.*?)</a></p>')
        title = pattern.findall(movie)[0]
        movie_dict["title"] = title
        print(title)

        # Rank
        pattern = re.compile(r'<i class="board-index board-index-(\d+)">')
        rank = pattern.findall(movie)[0]
        movie_dict["rank"] = rank
        print(rank)

        # Score
        pattern = re.compile(r'<p class="score"><i class="integer">(\d\.)</i><i class="fraction">(\d)</i></p>')
        score = pattern.findall(movie)
        score = score[0][0]+score[0][1]
        movie_dict["score"] = score
        print(score)

        # Poster image
        pattern = re.compile(r'<img data-src="(.*?)"')
        src = pattern.findall(movie)[0]
        movie_dict["src"] = src
        print(src)

        movielist.append(movie_dict)

    data = json.dumps(movielist,ensure_ascii=False)
    with open("maoyan.json",'w',encoding='utf-8') as fp:
        fp.write(data)

if __name__ == '__main__':
    url = 'https://maoyan.com/board'
    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
    maoyan(url,headers,proxies)

11. Animal World

# Requirements:
# 1. Animal site: http://www.iltaw.com/animal/all (Animal World)
#    For each animal, collect the picture, Chinese name, English name, scientific
#    name, introduction, diet, reproduction, habits, distribution, physical
#    features, ecology, growth and breeding, and geographic range.

import requests
import json
import re

# Return the first match, or '' if the result list is empty
def panduan(obj):
    if obj:
        return obj[0]
    else:
        return ''

def dongwu(url,headers,proxies):
    content = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
    pattern = re.compile(r'<li class="clearfix">[\d\D]*?</li>')
    animal_list = pattern.findall(content)
    animallist = []
    for animal in animal_list:
        animal_dict = {}
        # Detail page link
        pattern = re.compile('<h3><a href="(.*?)" target="_blank">')
        detail = pattern.findall(animal)[0]
        animal_dict["detail"] = detail
        animallist.append(animal_dict)
    # Loop over the list and follow each detail link
    animaldetails = []
    for animal_href in animallist:
        animaldict = {}
        href = animal_href.get("detail")
        response = requests.get(url=href,headers=headers,proxies=proxies).content.decode('utf-8')
        # Picture
        pattern = re.compile('<img class="loading-v1" data-url="(.*?)"')
        img = pattern.findall(response)
        animaldict["图片"] = panduan(img)
        # Chinese name
        pattern = re.compile('中文名:(.*?);<br />')
        name = pattern.findall(response)
        animaldict["中文名"] = panduan(name)
        # English name
        pattern = re.compile('<br />英文名:(.*?);<br />')
        engname = pattern.findall(response)
        animaldict["英文名"] = panduan(engname)
        # Scientific name
        pattern = re.compile('<br />学名:<em>(.*?)</em>')
        xuename = pattern.findall(response)
        animaldict["学名"] = panduan(xuename)
        # Introduction
        pattern = re.compile('。<br />(.*?)</div>')
        jianjie = pattern.findall(response)
        animaldict["简介"] = panduan(jianjie)
        # Diet
        pattern = re.compile('食性: </em><span>(.*?)</span>')
        food = pattern.findall(response)
        animaldict["食性"] = panduan(food)
        # Reproduction
        pattern = re.compile('繁殖: </em><span>(.*?)</span>')
        fanzhi = pattern.findall(response)
        animaldict["繁殖"] = panduan(fanzhi)
        # Habits
        pattern = re.compile('习性: </em><span>(.*?)</span>')
        xixing = pattern.findall(response)
        animaldict["习性"] = panduan(xixing)
        # Distribution
        pattern = re.compile('分布: </em><span>(.*?)</span>')
        fenbu = pattern.findall(response)
        animaldict["分布"] = panduan(fenbu)
        # Physical features
        pattern = re.compile('外形特征.*?<div class="description">(.*?)</div>',re.S)
        waixing = pattern.findall(response)
        animaldict["外形特征"] = panduan(waixing)
        # Ecology
        pattern = re.compile('生态习性.*?<div class="description">(.*?)</div>',re.S)
        shengtai = pattern.findall(response)
        animaldict["生态习性"] = panduan(shengtai)
        # Growth and breeding
        pattern = re.compile('生长繁殖.*?<div class="description">(.*?)</div>', re.S)
        shengzhang = pattern.findall(response)
        animaldict["生长繁殖"] = panduan(shengzhang)
        # Geographic range
        pattern = re.compile('地理分布.*?<div class="description">(.*?)</div>', re.S)
        dili = pattern.findall(response)
        animaldict["地理分布"] = panduan(dili)
        # Append to the list
        animaldetails.append(animaldict)

    # Serialize to JSON and save to a file
    data = json.dumps(animaldetails,ensure_ascii=False)
    with open('animal.json','w',encoding='utf-8') as fp:
        fp.write(data)

if __name__ == '__main__':
    url = 'http://www.iltaw.com/animal/all'
    proxies = {'http': '', 'https': '117.28.96.160:9999'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
    'Cookie': '__cfduid=d45abc9c47821980f926a234bc1d141771574671242; PHPSESSID=1111adcbe2de581fc10d8912a9100db8; Hm_lvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; Hm_lpvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; UM_distinctid=16ea1b94f0f142-0edd2ae99214fd-54133310-100200-16ea1b94f108d2; CNZZDATA1000267376=387552791-1574669922-%7C1574669922'}
    dongwu(url,headers,proxies)

12. Guba (Eastmoney Stock Forum)

import requests
import json
import re

def guba(url,headers,proxies):
    response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
    pattern = re.compile(r'<ul class="newlist" tracker-eventcode="gb_xgbsy_ lbqy_rmlbdj">[\w\W]*?</ul>')
    ul_block = pattern.findall(response)[0]
    li_pattern = re.compile(r'<li>([\d\D]*?)</li>')
    li_list = li_pattern.findall(ul_block)
    gubalist = []

    for li in li_list:
        item = {}
        pattern = re.compile(r'<cite>(.*?)</cite>',re.S)
        data = pattern.findall(li)
        print(data[0].strip())
        print(data[1].strip())
        item["阅读量"] = data[0].strip()
        item["评论数"] = data[1].strip()
        # Forum name
        pattern = re.compile(r'<span class="sub">.* class="balink">(.*?)</a>]')
        baname = pattern.findall(li)
        print(baname[0])
        item["吧名称"] = baname[0]
        # Title
        pattern = re.compile(r'<a href=".* title="(.*?)"')
        title = pattern.findall(li)
        print(title[0])
        item["标题"] = title[0]
        # Author
        pattern = re.compile(r'target="_blank"><font>(.*?)</font></a>')
        author = pattern.findall(li)
        print(author[0])
        item["作者"] = author[0]
        # Update time
        pattern = re.compile(r'</cite><cite class="last">(.*?)</cite>')
        time = pattern.findall(li)
        print(time[0])
        item["更新时间"] = time[0]
        # Detail URL
        pattern = re.compile(r'<a href="(/news,.*?)"')
        url = pattern.findall(li)[0]
        full_url = 'http://guba.eastmoney.com'+url

        # Fetch the detail page content
        response = requests.get(url=full_url,headers=headers,proxies=proxies).content.decode('utf-8')
        pattern = re.compile(r'"post_content".*"([\d\D]*?),"post_publish_time"')
        content = pattern.findall(response)[0]
        pattern = re.compile(
            r'[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+')
        text = pattern.findall(content)
        # Join the matched runs into plain text
        content_text = ','.join(text)
        item["详情内容"] = content_text
        gubalist.append(item)
        # i+=1
        # print(i, "====================")
    datajson = json.dumps(gubalist,ensure_ascii=False)
    with open ('guba.json','w',encoding='utf-8') as fp:
        fp.write(datajson)


if __name__ == '__main__':
    for i in range(1,30):
        url = 'http://guba.eastmoney.com/default,99_{}.html'.format(i)
        proxies = {'http': '', 'https': '58.253.157.232:9999'}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
            'Cookie': '__cfduid=d45abc9c47821980f926a234bc1d141771574671242; PHPSESSID=1111adcbe2de581fc10d8912a9100db8; Hm_lvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; Hm_lpvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; UM_distinctid=16ea1b94f0f142-0edd2ae99214fd-54133310-100200-16ea1b94f108d2; CNZZDATA1000267376=387552791-1574669922-%7C1574669922'}
        guba(url,headers,proxies)

13. Online Pharmacy (111.com.cn)


import requests
import re
import json

def yaofang(url,headers,proxies):
    response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('gbk')
    pattern = re.compile(r'<li id="producteg_.*>[\w\W]*?</li>')
    li_list = pattern.findall(response)
    # Loop over the li tags and collect detail links
    href_list = []
    for li in li_list:
        pattern = re.compile(r'href="(.*?)"')
        href = pattern.findall(li)
        if href:
            full_href = 'https:'+href[0]
            href_list.append(full_href)

    # Loop over the detail pages
    detail_list = []
    for detail_href in href_list:
        item = {}
        response = requests.get(url=detail_href,headers=headers,proxies=proxies).content.decode('gbk')
        # Shop name
        pattern = re.compile(r'class="right_property".*<h3>(.*?)</h3>',re.S)
        name = pattern.findall(response)
        if name:
            item["店名"] = name[0]
        else:
            item["店名"] = "自营"
        # Product details table
        pattern = re.compile(r'<div class="goods_intro">[\w\W]*?</div>')
        table_list = pattern.findall(response)
        for detail in table_list:
            # Product name
            pattern = re.compile(r'<th>商品名称:</th>.*?<td colspan="3">(.*?)</td>',re.S)
            name = pattern.findall(detail)[0]
            item["商品名称"] = name
            # Brand
            pattern = re.compile(r'品  牌:</th>.*?<td>(.*?)</td>',re.S)
            pinpai = pattern.findall(detail)[0]
            item["品牌"] = pinpai
            # Specification
            pattern = re.compile(r'规  格:</th>.*?<td>(.*?)</td>', re.S)
            guige = pattern.findall(detail)[0]
            item["规格"] = guige
            # Weight
            pattern = re.compile(r'重  量:</th>.*?<td>(.*?)</td>', re.S)
            zhongliang = pattern.findall(detail)[0]
            item["重量"] = zhongliang
            # Manufacturer
            pattern = re.compile(r'生产厂商:</th>.*?<td>(.*?)</td>', re.S)
            shengchan = pattern.findall(detail)[0]
            item["生产厂商"] = shengchan
            detail_list.append(item)
    data = json.dumps(detail_list,ensure_ascii=False)
    with open('yaofan.json','w',encoding='utf-8') as fp:
        fp.write(data)


if __name__ == '__main__':
    for i in range(1,6):
        url = 'https://www.111.com.cn/categories/953710?tp=10-{}'.format(i)
        proxies = {'http': '117.95.192.4:9999', 'https': ''}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
        'Cookie': '__cfduid=d45abc9c47821980f926a234bc1d141771574671242; PHPSESSID=1111adcbe2de581fc10d8912a9100db8; Hm_lvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; Hm_lpvt_2b65b835db5cae63ad487fd29631b1c7=1574671306; UM_distinctid=16ea1b94f0f142-0edd2ae99214fd-54133310-100200-16ea1b94f108d2; CNZZDATA1000267376=387552791-1574669922-%7C1574669922'}
        yaofang(url,headers,proxies)

14. Maoyan Movies with XPath

from lxml import etree
import requests
import json

def maoyan(url,headers,proxies):
    response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf8')
    tree = etree.HTML(response)
    dd_list = tree.xpath('//dd')
    movies = []
    for dd in dd_list:
        item = {}
        # Rank
        paiming = dd.xpath('./i/text()')[0]
        item["排名"] = paiming
        # Title
        name = dd.xpath('./a/@title')[0]
        item["名字"] = name
        # Cast
        star = dd.xpath('.//p[@class="star"]/text()')[0].strip().replace('主演:','')
        item["主演"] = star
        # Release date
        time = dd.xpath('.//p[@class="releasetime"]/text()')[0].replace('上映时间:','')
        item["上映时间"] = time
        # Poster link
        img = dd.xpath('.//img[@class="board-img"]/@data-src')[0]
        item['图片链接'] = img
        # Detail link
        detail = dd.xpath('./a/@href')[0]
        item["详情链接"] = detail
        movies.append(item)
    data = json.dumps(movies,ensure_ascii=False)
    with open('maoyan.json','w',encoding='utf-8') as fp:
        fp.write(data)



if __name__ == '__main__':
    url = 'https://maoyan.com/board'
    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
    maoyan(url,headers,proxies)

15. Lianjia

"""
链家网站爬取
一、案例需求:
链家:https://bj.fang.lianjia.com/loupan/
1、获取所有的城市的拼音
2、根据拼音去拼接url,获取所有的数据。
3、列表页:图片,楼盘名称,均价,建筑面积,区域,商圈
详情页:户型(["8室5厅8卫", "4室2厅3卫", "5室2厅2卫"]),朝向,图片(列表),用户点评(选爬)

项目地址:通燕高速耿庄桥北出口中化石油对面
售楼处地址:通燕高速北侧耿庄桥出口北500米潞苑南大街(接待时间 9:00 - 18:00)
开发商:石榴置业集团股份有限公司
物业公司:浙江省绿城物业服务有限公司
最新开盘:2018.01.20 转成时间戳
物业类型:别墅
交房时间:2016年05月10日 转成2016-05-10
容积率:1.20
产权年限:70年
绿化率:30%
规划户数:173
物业费用:6.91~7.13元/m2/月(不要后面的:元/m2/月)
车位情况:地下车位数584(只要数字)
供暖方式:集中供暖
供水方式:民水
供电方式:民电
建筑类型:板楼
嫌恶设施:暂无
占地面积:39,600㎡(不要单位)
建筑面积:40,000㎡(不要单位)

"""
import pymysql
class PyMysql:
    def __init__(self):
        ## 1. Connect to the database
        self.db = pymysql.connect(user='root',host='localhost',password='123456',database='lianjia')
        ## 2. Create a cursor
        self.c = self.db.cursor()
    def sql_caozuo(self,sql):
        ## 3. Execute the SQL statement
        self.c.execute(sql)
        ## Show the result set
        print(self.c.fetchall())
    ## __del__ runs when the object is garbage-collected, after the main program finishes
    def __del__(self):
        self.db.commit()
        self.c.close()
        self.db.close()

import requests
from lxml import etree
import json
import re
import math

def panduan(obj):
    if obj:
        return obj[0]
    else:
        return 'None'


def pinyin(url2,headers,proxies):
    response = requests.get(url=url2,headers=headers,proxies=proxies).text
    response = str(response)
    pattern = re.compile(r'"short":"(.*?)"')
    pinyin_list = pattern.findall(response)

    for pinyin in pinyin_list:
        full_url = 'https://{}.fang.lianjia.com/loupan/'.format(pinyin)
        try:
            response1 = requests.get(url=full_url,headers=headers,proxies=proxies).content.decode('utf-8')
            tree = etree.HTML(response1)
        except Exception:
            continue  # skip cities whose page fails to load
        # Extract the page count:
        total_page = tree.xpath('//div[@class="page-box"]/@data-total-count')
        for page in range(1, math.ceil(int(total_page[0]) / 10) + 1):
            url = full_url+'pg{}'.format(page)
            # print(url)
            response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf-8')
            tree = etree.HTML(response)
            li_list = tree.xpath('//ul[@class="resblock-list-wrapper"]/li')
            lianjia_list=[]
            for li in li_list:
                lianjia_dict = {}
                # (1) Picture:
                src = li.xpath('.//img[@class="lj-lazy"]/@data-original')
                print(panduan(src))
                tupian = panduan(src)
                lianjia_dict["图片链接"] = tupian
                # (2) Development name
                title = li.xpath('./a/@title')
                print(panduan(title))
                loupan = panduan(title)
                lianjia_dict["楼盘名称"] = loupan
                # (3) Average price
                price = li.xpath('.//span[@class="number"]/text()|.//span[@class="desc"]/text()')
                price_new = ''.join(price).replace('\xa0', '')
                print(price_new)
                lianjia_dict["均价"] = price_new


                # (4) Floor area
                area_list = li.xpath('.//div[@class="resblock-area"]/span/text()')
                print(panduan(area_list))
                jianzhu = panduan(area_list)
                lianjia_dict["建筑面积"] = jianzhu
                # (5) District
                add = li.xpath('.//div[@class="resblock-location"]/span/text()')
                quyu = panduan(add)
                print(panduan(add))
                lianjia_dict["区域"] = quyu
                # (6) Business area
                shop = li.xpath('.//div[@class="resblock-location"]/a/text()')
                print(panduan(shop))
                shangquan = panduan(shop)
                lianjia_dict["商圈"] = shangquan

                # Detail page URL:
                href = li.xpath('.//a[@class="resblock-room"]/@href')[0].replace("/loupan/","")
                detail_url = full_url+href
                # print(detail_url)
                response = requests.get(url=detail_url,headers=headers,proxies=proxies).content.decode('utf-8')
                detail_tree = etree.HTML(response)
                # Visit the detail page
                # (7) Floor plans
                ul_list = detail_tree.xpath('//li[@data-index="0"]/ul')
                huxinglist = []
                chaoxianglist = []
                imglist = []
                for ul in ul_list:
                    # Floor plan
                    huxing = ul.xpath('.//div[@class="content-title"]/text()')
                    huxinglist.append(panduan(huxing).strip())
                    # Orientation
                    chaoxiang = ul.xpath('.//div[@class="content-area"]/text()')
                    chaoxianglist.append(panduan(chaoxiang))
                    # Floor-plan image
                    img = ul.xpath('.//img/@src')[0]
                    imglist.append(img)
                print(imglist)
                lianjia_dict["户型图片"] = imglist

                # Pair each floor plan with its orientation
                huchao_list = []
                for huxing,chaoxiang in zip(huxinglist,chaoxianglist):
                    huchao_list.append({huxing: chaoxiang})
                print(huchao_list)
                lianjia_dict["户型朝向"] = huchao_list
                # Development detail info
                xhref = detail_tree.xpath('.//div[@class="more-building"]/a/@href')[0]
                href = full_url.replace('/loupan/','')+xhref
                # print(href)
                response_xinxi = requests.get(url=href, headers=headers,proxies=proxies).content.decode('utf-8')
                xinxi_tree = etree.HTML(response_xinxi)
                xinxi_list = xinxi_tree.xpath('//div[@class="big-left fl"]')
                for xinxi in xinxi_list:
                    # Sales office address
                    shoulou = xinxi.xpath('.//ul[@class="x-box"][1]/li[@class="all-row"][2]/span[@class="label-val"]/text()')
                    sl = panduan(shoulou)
                    print(sl)
                    lianjia_dict["售楼处地址"] = sl

                    # Developer
                    kaifa = xinxi.xpath('.//ul[@class="x-box"][1]/li[@class="all-row"][3]/span[@class="label-val"]/text()')
                    kf = panduan(kaifa)
                    print(kf)
                    lianjia_dict["开发商"] = kf
                    # Property management company
                    wuye = xinxi.xpath('.//ul[@class="x-box"][3]//span[@class="label-val"]/text()')
                    wy = panduan(wuye)
                    print(wy)
                    lianjia_dict["物业公司"] = wy
                    # Latest opening date
                    kaipan = xinxi.xpath('./ul[@class="fenqi-ul"]/li[3]/span[@class="fq-td fq-open"]/span/text()')
                    kp = panduan(kaipan)
                    print(kp)
                    lianjia_dict["最新开盘"] = kp
                    # Property type
                    wuyetype = xinxi.xpath('.//ul[@class="x-box"][2]//span[@class="label-val"]/text()')
                    wuyet = panduan(wuyetype)
                    print(wuyet)
                    lianjia_dict["物业类型"] = wuyet
                    # Handover date
                    jiaofangtime = xinxi.xpath('./ul[@class="fenqi-ul"]/li[@class="fq-nbd"]/span[@class="fq-td fq-open"]/span/text()')
                    jf = panduan(jiaofangtime)
                    print(jf)
                    lianjia_dict["交房时间"] = jf
                    # Plot ratio
                    rongji = xinxi.xpath('./ul[@class="x-box"][2]/li[4]/span[@class="label-val"]/text()')
                    rj = panduan(rongji).strip()
                    print(rj)
                    lianjia_dict["容积率"] = rj
                    # Tenure (years)
                    chanquan = xinxi.xpath('.//ul[@class="x-box"][2]/li[8]/span[@class="label-val"]/text()')
                    cq = panduan(chanquan).strip()
                    print(cq)
                    lianjia_dict["产权年限"] = cq
                    # Green coverage
                    lvhua = xinxi.xpath('.//ul[@class="x-box"][2]/li[2]/span[@class="label-val"]/text()')
                    lh = panduan(lvhua).strip()
                    print(lh)
                    lianjia_dict["绿化率"] = lh
                    # Planned households
                    yonghu = xinxi.xpath('.//ul[@class="x-box"][2]/li[7]/span[@class="label-val"]/text()')
                    yh = panduan(yonghu)
                    print(yh)
                    lianjia_dict["规划户数"] = yh
                    # Property fee
                    wuyefei = xinxi.xpath('.//ul[@class="x-box"][3]/li[3]/span[@class="label-val"]/text()')
                    wyf = panduan(wuyefei)
                    print(wyf)
                    lianjia_dict["物业费用"] = wyf
                    # Parking
                    chewei = xinxi.xpath('.//ul[@class="x-box"][3]/li[7]/span[@class="label-val"]/text()')
                    cw = panduan(chewei).strip()
                    print(cw)
                    lianjia_dict["车位情况"] = cw
                    # Heating
                    gongnuan = xinxi.xpath('.//ul[@class="x-box"][3]/li[4]/span[@class="label-val"]/text()')
                    gn = panduan(gongnuan)
                    print(gn)
                    lianjia_dict["供暖方式"] = gn
                    # Water supply
                    gongshui = xinxi.xpath('.//ul[@class="x-box"][3]/li[5]/span[@class="label-val"]/text()')
                    gs = panduan(gongshui)
                    print(gs)
                    lianjia_dict["供水方式"] = gs
                    # Electricity supply
                    gongdian = xinxi.xpath('.//ul[@class="x-box"][3]/li[6]/span[@class="label-val"]/text()')
                    gd = panduan(gongdian)
                    print(gd)
                    lianjia_dict["供电方式"] = gd
                    # Nuisance facilities: skipped
                    # Site area
                    mianji = xinxi.xpath('.//ul[@class="x-box"][2]/li[3]/span[@class="label-val"]/text()')
                    mj = panduan(mianji).strip()
                    print(mj)
                    lianjia_dict["占地面积"] = mj
                    # Save to the database
                    p = PyMysql()
                    sql = 'insert into lianjia_data(img,name,price,area,address,shangquan,huxinimg,huxingdata,shoulouadd,kaifa,wuye,kaipan,jianzhutype,jiaofangtime,rongji,chanquan,lvhau,usernum,wuyefei,chewei,gn,gs,gd,jzarea) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(tupian,loupan,price_new,jianzhu,quyu,shangquan,imglist,huchao_list,sl,kf,wy,kp,wuyet,jf,rj,cq,lh,yh,wyf,cw,gn,gs,gd,mj)
                    p.sql_caozuo(sql)
                    # print(lianjia_dict)
                    lianjia_list.append(lianjia_dict)
                # print(lianjia_list)
            # data =json.dumps(lianjia_list,ensure_ascii=False)
            # with open('lianjia.json','a',encoding='utf-8') as fp:
            #         fp.write(data)

if __name__ == '__main__':
    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }

    url2 = 'https://ajax.api.lianjia.com/config/cityConfig/getConfig?callback=jQuery1111039566567830421073_1574853147759&type=province&category=1&_=1574853147760'

    p = PyMysql()
    ## drop-table statement
    # p.sql_caozuo('drop tables demo03')
    ## create-table statement
    p.sql_caozuo(
        'create table lianjia_data(id int primary key auto_increment,img varchar(255),name varchar(255),price varchar(255),area varchar(255),address varchar(255),shangquan varchar(255),huxinimg varchar(5555),huxingdata varchar(5555),shoulouadd varchar(255),kaifa varchar(255),wuye varchar(255),kaipan varchar(255),jianzhutype varchar(255),jiaofangtime varchar(255),rongji varchar(255),chanquan varchar(255),lvhau varchar(255),usernum varchar(255),wuyefei varchar(255),chewei varchar(255),gn varchar(255),gs varchar(255),gd varchar(255),jzarea varchar(255))')
    ## insert statements (issued inside the crawler)

    pinyin(url2,headers,proxies)
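
The requirements at the top of this section call for several field normalizations that the crawler above never actually performs. A hedged sketch of those conversions (the helper names are my own; the sample values come from the requirements list):

import re
import time

def to_timestamp(kaipan):
    # "2018.01.20" -> Unix timestamp
    return int(time.mktime(time.strptime(kaipan, '%Y.%m.%d')))

def to_iso_date(jiaofang):
    # "2016年05月10日" -> "2016-05-10"
    m = re.match(r'(\d{4})年(\d{1,2})月(\d{1,2})日', jiaofang)
    return '{}-{}-{}'.format(*m.groups()) if m else jiaofang

def strip_fee_unit(wuyefei):
    # "6.91~7.13元/m2/月" -> "6.91~7.13"
    return wuyefei.replace('元/m2/月', '')

def digits_only(chewei):
    # "地下车位数584" -> "584"
    return ''.join(re.findall(r'\d+', chewei))

def strip_area_unit(mianji):
    # "39,600㎡" -> "39600"
    return mianji.replace(',', '').replace('㎡', '')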

16. Taobao

import requests
import re
import json
from lxml import etree

def panduan(obj):
    if obj:
        return obj
    else:
        return ''

def taobao(url,headers,proxies):
    response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf8')
    pattern = re.compile(r'\((.*)\)',re.S)
    data = pattern.findall(response)[0].strip()
    data = json.loads(data)
    data = data["result"]
    for k,v in data.items():
        item = {}
        result = v["result"]
        for i in result:
            item["价格"] = i["item_current_price"]
        # print(item)
        # print(result)


if __name__ == '__main__':
    datalist = [
        'https://tce.taobao.com/api/mget.htm?callback=jsonp1579&tce_sid=1870316,1871653&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
        'https://tce.taobao.com/api/mget.htm?callback=jsonp1666&tce_sid=1870321,1871654&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
        'https://tce.taobao.com/api/mget.htm?callback=jsonp1753&tce_sid=1870333,1871655&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
        'https://tce.taobao.com/api/mget.htm?callback=jsonp1840&tce_sid=1870340,1871656&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
        'https://tce.taobao.com/api/mget.htm?callback=jsonp1927&tce_sid=1870341,1871659&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
        'https://tce.taobao.com/api/mget.htm?callback=jsonp2014&tce_sid=1870342,1871657&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online',
        'https://tce.taobao.com/api/mget.htm?callback=jsonp2101&tce_sid=1870343,1871658&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online']

    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
    for url in datalist:
        taobao(url,headers,proxies)

17. NetEase Cloud Music


# https://music.163.com/#/discover/artist/

import requests
import json
import re
from lxml import etree

def wangyi(url,headers,proxies):
    response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf8')
    # with open('wangyi.html','w',encoding='utf-8') as fp:
    #     fp.write(response)
    # Get the links for the singer categories
    tree = etree.HTML(response)
    sing_list = tree.xpath('//div[@id="singer-cat-nav"]//a/text()')
    # print(sing_list)

    # Get the category URLs
    sing_url_list = tree.xpath('//div[@id="singer-cat-nav"]//a/@href')
    # print(sing_url_list)

    for singer,singer_url in zip(sing_list,sing_url_list):
        sing_full_url = 'https://music.163.com'+singer_url
        response = requests.get(url=sing_full_url,headers=headers,proxies=proxies).content.decode('utf8')
        tree = etree.HTML(response)
        # Extract the initial-letter links
        letter_href_list = tree.xpath('//ul[@id="initial-selector"]/li[position()>1]/a/@href')
        for letter_url in letter_href_list:
            letter_ful_url = 'https://music.163.com'+letter_url
            response = requests.get(url=letter_ful_url,headers=headers,proxies=proxies).content.decode('utf8')
            tree = etree.HTML(response)
            # Extract singer names
            name_list = tree.xpath(
                '//ul[@class="m-cvrlst m-cvrlst-5 f-cb"]/li//a[@class="nm nm-icn f-thide s-fc0"]/text()')
            href_list = tree.xpath(
                '//ul[@class="m-cvrlst m-cvrlst-5 f-cb"]/li//a[@class="nm nm-icn f-thide s-fc0"]/@href')
            for name, j in zip(name_list, href_list):
                full_href = 'https://music.163.com' + j.strip()
                print(name,full_href)
if __name__ == '__main__':
    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', }
    url = 'https://music.163.com/discover/artist/cat'
    wangyi(url,headers,proxies)

18. Zhilian Zhaopin (with a UI)

import requests
import re
import json
from lxml import etree
import time
from time import strftime

import pymysql
import random

class PyMysql:
    def __init__(self):
        ## 1. Connect to the database
        self.db = pymysql.connect(user='root',host='localhost',password='123456',database='lianjia')
        ## 2. Create a cursor
        self.c = self.db.cursor()
    def sql_caozuo(self,sql):
        ## 3. Execute the SQL statement
        self.c.execute(sql)
        ## Show the result set
        print(self.c.fetchall())
    ## __del__ runs when the object is garbage-collected, after the main program finishes
    def __del__(self):
        self.db.commit()
        self.c.close()
        self.db.close()


def zhilian(url,headers,proxies):
    response = requests.get(url=url,headers=headers,proxies=proxies).content.decode('utf8')
    data = json.loads(response)
    # print(data["data"]["results"])
    data_list = []
    for job in data["data"]["results"]:
        job_dict = {}
        # print(job)

        # Job title
        jobname = job["jobName"]
        print(jobname)
        job_dict["岗位名称"] = jobname
        # Company name
        companyname = job["company"]["name"]
        print(companyname)
        job_dict["公司名称"] = companyname
        # Company size
        companynum = job["company"]["size"]["name"]
        print(companynum)
        job_dict["公司人数"] = companynum
        # Company type
        companytype = job["company"]["type"]["name"]
        print(companytype)
        job_dict["公司类型"] = companytype
        # Skill requirements
        positionLabel = job["positionLabel"]
        data = json.loads(positionLabel)
        skillLabel = data["skillLabel"]
        label_list = []
        try:
            if skillLabel[0]:
                for label in skillLabel:
                    label = label["value"]
                    label_list.append(label)
        except (KeyError, IndexError, TypeError):
            label_list.append("None")
        label = label_list
        job_dict["技能要求"] = label_list

        # Detail URL
        detail_url = job["positionURL"]
        print(detail_url)
        response = requests.get(url=detail_url,headers=headers,proxies=proxies).content.decode("utf8")
        # time.sleep(5)
        # print(response)
        tree = etree.HTML(response)

        # Job description
        zhize_list = tree.xpath('.//div[@class="describtion"]/div[@class="describtion__detail-content"]/text()')
        print(zhize_list)
        job_dict["职业描述"] = zhize_list

        # Work location
        display = job["city"]["display"]
        businessArea = job["businessArea"]
        bs = businessArea
        ds = display
        area = ds + bs
        print(area)
        job_dict["工作地点"] = area
        # Salary
        salary = job["salary"]
        print(salary)
        job_dict["薪资水平"] = salary
        # Qualifications
        # Education
        eduLevel = job["eduLevel"]["name"]
        # Work experience
        workingExp = job["workingExp"]["name"]
        zige = eduLevel + workingExp
        print(zige)
        job_dict["任职资格"] = zige
        # Last updated
        updateDate = job["updateDate"]
        fabutime = updateDate
        print(fabutime)
        job_dict["更新发布时间"] = fabutime
        # Crawled by
        crawl_name = "高祎曼"
        print(crawl_name)
        job_dict["爬取人"] = crawl_name
        # Crawl time
        paqutime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())
        print(paqutime)
        job_dict["爬取时间"] = paqutime
        data_list.append(job_dict)
        paquwangzhan = "智联"
        # p = PyMysql()
        # sql = 'insert into zhilian_data(岗位名称,公司名称,公司人数,公司类型,技能要求,职业描述,工作地点,薪资水平,任职资格,更新发布时间,爬取人,爬取时间,爬取网站) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(jobname,companyname,companynum,companytype,label,zhize_list,area,salary,zige,fabutime,crawl_name,paqutime,paquwangzhan)
        # p.sql_caozuo(sql)

    # data = json.dumps(data_list,ensure_ascii=False)
    # with open('zhilian.json','w',encoding='utf-8') as fp:
    #     fp.write(data)



if __name__ == '__main__':
    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
        'cookie':'guid=7575-be2f-7c15-c306; amap_ver=1536672475634; key=6a7665aa7301eae686d9e79884d0445b'}
    url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=all&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=python%E7%88%AC%E8%99%AB%E5%B7%A5%E7%A8%8B%E5%B8%88&kt=3&=0&_v=0.15635057&x-zp-page-request-id=fd5128b31ef84aceb1e84d41097963e1-1575100818571-464999&x-zp-client-id=04d803dc-74e4-4170-cf84-d3cdc5607b84&MmEwMD=4T6BXuvx_UOBZhGW5PAnis7UzYfOFZcUeeZ1Hq9jrdCULnycfzc9C7nw5eIOH2F4HihMYmky5F4y4wzH6vHJoJaLwKt2PX9n9eLVZASrAFRl3WXEqNzECxIt2if_0tHRqzNsYsSgp68KeoERtc2Pj8bQuiUJzPKbJJf5kIr_sReC3iAmhs5AkB.q55sS343U6.eC7.kfI2sb_.vTfBfDzJ6zSB_lgyMFXwMfmYMZMTDoKyG0JCeuMjUTVC1k0gGNhVbeLOCCzRjcSTvAKFfbPxnO.W5mSiOgXGCdZiT5Bxlz1piZ.6kc3N4J6QqObZblvoscyt.v0zGieC_8PYEqsj2ztAZNBcbreqmYxmNhVIQzljDX0LhKnaRdfJ5AowwtwYMUf9eWYMcdlkJkv9ton7pbV'
    # p = PyMysql()
    ## create-table statement
    # p.sql_caozuo(
    #     'create table lianjiadata(id int primary key auto_increment,)')
    zhilian(url,headers,proxies)

19. Lagou (note: the sample URL below actually queries shixiseng.com)



import pymysql
import random

class PyMysql:
    def __init__(self):
        ## 1. Connect to the database
        self.db = pymysql.connect(user='root',host='localhost',password='123456',database='lianjia')
        ## 2. Create a cursor
        self.c = self.db.cursor()
    def sql_caozuo(self,sql):
        ## 3. Execute the SQL statement
        self.c.execute(sql)
        ## Show the result set
        print(self.c.fetchall())
    ## __del__ runs when the object is garbage-collected, after the main program finishes
    def __del__(self):
        self.db.commit()
        self.c.close()
        self.db.close()

import requests
from lxml import etree
import json
import re
import math
import time

def panduan(obj):
    if obj:
        return obj[0]
    else:
        return 'None'


def lagou(url,headers,proxies):
    time.sleep(3)
    response = requests.post(url=url,headers=headers,proxies=proxies).text
    # data_list = json.loads(response)
    print(response)






if __name__ == '__main__':
    proxies = {'http': '125.110.90.93:9000', 'https': ''}
    usag_list = [
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
    ]
    usag = random.choice(usag_list)
    headers = {
        'User-Agent': usag,
        'cookie':'user_trace_token=20191129155328-84048293-4801-4663-91ae-9695ac9c4693; _ga=GA1.2.1490834596.1575014072; LGUID=20191129155329-54ea0923-127d-11ea-a68e-5254005c3644; _gid=GA1.2.898573975.1575014080; index_location_city=%E5%8C%97%E4%BA%AC; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216eb627d70c176-0beb948187e8ff-32365f08-1049088-16eb627d70d272%22%2C%22%24device_id%22%3A%2216eb627d70c176-0beb948187e8ff-32365f08-1049088-16eb627d70d272%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; JSESSIONID=ABAAABAAADEAAFIA1EE71982D903A109AFEFF779585C4FA; WEBTJ-ID=20191130095146-16eba01bb9d47a-020b149bcc6827-32365f08-1049088-16eba01bb9e36d; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1575014080,1575030607,1575033330,1575078706; LGSID=20191130095042-d17fd70b-1313-11ea-a68e-5254005c3644; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DwzB7mu7hBPcdl6Tni88-qBT6Hm86I74H5shBvm7Ugzi%26wd%3D%26eqid%3D962e999d0007d16a000000025de1caeb; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=search_code; SEARCH_ID=1a48505a36e843569e84500c2600b603; _gat=1; X_HTTP_TOKEN=440fb692ec96ea8037997057518ccbba7f70585ce7; LGRID=20191130101253-ead576cf-1316-11ea-a68e-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1575080037'

    }

    for i in range(1,8):
        url = 'https://www.shixiseng.com/app/interns/search/v2?build_time=1575096760616&page={}&keyword=python%E7%88%AC%E8%99%AB&type=intern&area=&months=&days=&degree=&official=&enterprise=&salary=-0&publishTime=&sortType=&city=%E5%8C%97%E4%BA%AC&internExtend='.format(i)
        lagou(url, headers, proxies)
    # p = PyMysql()
    ## create-table statement
    # p.sql_caozuo(
    #     'create table lianjiadata(id int primary key auto_increment,img varchar(255),name varchar(255),price varchar(255),area varchar(255),address varchar(255),shangquan varchar(255),huxinimg varchar(5555),huxingdata varchar(5555),shoulouadd varchar(255),kaifa varchar(255),wuye varchar(255),kaipan varchar(255),jianzhutype varchar(255),jiaofangtime varchar(255),rongji varchar(255),chanquan varchar(255),lvhau varchar(255),usernum varchar(255),wuyefei varchar(255),chewei varchar(255),gn varchar(255),gs varchar(255),gd varchar(255),jzarea varchar(255))')
    ## insert statement


20. Headless Browser

from selenium import webdriver
import time

# (1) Pair Selenium with a browser driver
driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs-2.1.1-windows\bin\phantomjs.exe')

# (2) Visit the site
url = 'https://www.baidu.com/'
driver.get(url=url)

# (3) Take a screenshot
driver.save_screenshot('baidu.png')

# (4) Find the search box and type into it:
driver.find_element_by_id('kw').send_keys('吴亦凡')

# (5) Screenshot again:
driver.save_screenshot('baidu02.png')

# (6) Simulate a click
driver.find_element_by_id('su').click()
time.sleep(3)
driver.save_screenshot('baidu03.png')
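
PhantomJS is no longer maintained and recent Selenium releases have dropped support for it. A hedged equivalent using headless Chrome instead (assumes Selenium 4 and a chromedriver on the PATH):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

options = Options()
options.add_argument('--headless')          # run Chrome without a window
driver = webdriver.Chrome(options=options)  # assumes chromedriver is on PATH

driver.get('https://www.baidu.com/')
driver.save_screenshot('baidu.png')
driver.find_element(By.ID, 'kw').send_keys('吴亦凡')
driver.find_element(By.ID, 'su').click()
time.sleep(3)
driver.save_screenshot('baidu03.png')
driver.quit()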