Python requests Crawler Examples

Baidu Translate with requests

The mobile endpoint https://fanyi.baidu.com/basetrans returns its translations as JSON, but it only answers requests that look like they come from a phone, so the POST below carries a mobile User-Agent together with the Referer and Cookie captured from the browser's developer tools.

# -*- coding: utf-8 -*-
import requests
import json

if __name__ == '__main__':
    url = 'https://fanyi.baidu.com/basetrans'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Mobile Safari/537.36',
        'Referer':'https://fanyi.baidu.com/?aldtype=16047',
        'Cookie':'BIDUPSID=1E64848A7B840FDD0E2923266C07A98A; PSTM=1616913998; BAIDUID=1E64848A7B840FDD5C099A513854FDE2:FG=1; FANYI_WORD_SWITCH=1; REALTIME_TRANS_SWITCH=1; HISTORY_SWITCH=1; SOUND_PREFER_SWITCH=1; SOUND_SPD_SWITCH=1; td_cookie=1233489163; __yjs_duid=1_9f4322f1a1a2759cc92311f6041071d81619750212245; H_PS_PSSID=33985_33969_31254_33848_33607_26350_33892; BAIDUID_BFESS=1E64848A7B840FDD5C099A513854FDE2:FG=1; delPer=0; PSINO=3; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BCLID=7792273175421115645; BDSFRCVID=mU8OJexroG38EYQe91zlhKIRQuweG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKKLgOTHULF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR3aQ5rtKRTffjrnhPF3Q-LvXP6-hnjy3bAOKxTt5CT-SRrdyUrC0JLWbttf5q3RymJJ2-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvX--g3-7PWU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIEoCvt-5rDHJTg5DTjhPrMMhjrWMT-MTryKK8y3xTGeDTC3losyUFw0qofKx-fKHnRhlRNB-3iV-OxDUvnyxAZyxomtfQxtNRJQKDE5p5hKq5S5-OobUPUXMJ9LUvPBgcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj2CKLtCvDqTrP-trf5DCShUFsLJbJB2Q-XPoO3K8WsfTPbjua24AJyPrNQRQf5mkf3fbgylRM8P3y0bb2DUA1y4vpBtQmJeTxoUJ2-KDVeh5Gqfo15-0ebPRiWPb9QgbP2pQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hDvPKITD-tFO5eT22-usaGIO2hcHMPoosI89QqrGbjcL2xv3BMcNL6Tf0l05KfbUoqRHXnJi0btQDPvxBf7pWDTm_q5TtUJMqIDzbMohqfLn5MOyKMniBIv9-pnGBpQrh459XP68bTkA5bjZKxtq3mkjbPbDfn028DKu-n5jHjjbjG_83H; BCLID_BFESS=7792273175421115645; BDSFRCVID_BFESS=mU8OJexroG38EYQe91zlhKIRQuweG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKKLgOTHULF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR3aQ5rtKRTffjrnhPF3Q-LvXP6-hnjy3bAOKxTt5CT-SRrdyUrC0JLWbttf5q3RymJJ2-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvX--g3-7PWU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIEoCvt-5rDHJTg5DTjhPrMMhjrWMT-MTryKK8y3xTGeDTC3losyUFw0qofKx-fKHnRhlRNB-3iV-OxDUvnyxAZyxomtfQxtNRJQKDE5p5hKq5S5-OobUPUXMJ9LUvPBgcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj2CKLtCvDqTrP-trf5DCShUFsLJbJB2Q-XPoO3K8WsfTPbjua24AJyPrNQRQf5mkf3fbgylRM8P3y0bb2DUA1y4vpBtQmJeTxoUJ2-KDVeh5Gqfo15-0ebPRiWPb9QgbP2pQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hDvPKITD-tFO5eT22-usaGIO2hcHMPoosI89QqrGbjcL2xv3BMcNL6Tf0l05KfbUoqRHXnJi0btQDPvxBf7pWDTm_q5TtUJMqIDzbMohqfLn5MOyKMniBIv9-pnGBpQrh459XP68bTkA5bjZKxtq3mkjbPbDfn028DKu-n5jHjjbjG_83H; Hm_lvt_afd111fa62852d1f37001d1f980b6800=1620813569,1620819708; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1620813284,1620813294,1620819708,1620819712; __yjs_st=2_M2I0MjI3ZTEwM2UyZjAwZDJhYjljMDdhNGFiNDhkZTgwNjY0Y2U0NGYzMjBiZWY1NzJkYWYxMzA3ZWIzNWM1ZTE5ZDhhYzFiNGE2MzllZjBlMjMyMzEyM2UyNjA1ZDRkZjI5ZjI5N2ZjNzVlOTIyMWIzNDNiMmMxZWEzYTA5Y2M4NDBiNmRhZjRjMjZkNDJiODNlNTlmN2E5ZjVkNmYwZGEyZTliYWNiZDk4ZDhkYWMzZDYwYTY0ZjE1MjkxYTMzN2I2MjhhODE1M2UxNDBlZDQzYTIwOWIwNjczY2U3OTA0NWU3YmE0ZWQ1MGU3YzNhYzYxMmU1ZDAxNDhkMWU1NV83XzI4N2NlODFh; Hm_lpvt_afd111fa62852d1f37001d1f980b6800=1620819723; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1620819723; ab_sr=1.0.0_ZmE3YWU3ZTk4M2E2MTM2OGQ5Y2QwZWM3Njg0NWFkYTIzYTM2MjBmNDQzYjgzYjVhMGFmMTliNzU5ZGRmOGIxZDgzYzdhZWIzZmQ5ZjZlY2YxODk0NTk5ZGY3N2NlMWU4'
    }
    word = input('enter a word:')
    data = {
        'query': word,
        'from': 'en',
        'to': 'zh'
    }
    # Send the form data as a POST request; basetrans answers with JSON
    response = requests.post(url=url, data=data, headers=headers)
    dic_obj = response.json()
    print(dic_obj)
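
If the call succeeds, the translated text sits inside the returned JSON. The exact layout of the basetrans response is not shown above, so the snippet below is a minimal continuation sketch that assumes the translation lives under a `trans` key and guards the lookup accordingly.

```python
# Continuation of the example above: dic_obj is the parsed basetrans response.
# Assumption: the payload looks roughly like {"trans": [{"dst": "<translation>", ...}], ...}.
trans_items = dic_obj.get('trans') or []
if trans_items:
    print('translation:', trans_items[0].get('dst'))
else:
    print('unexpected response layout:', dic_obj)
```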

Douban movie chart with requests

Douban's chart page loads its data from a JSON interface, so there is no HTML to parse: the example sends a GET request with the same query parameters the page uses and saves the returned list to a local file.

import requests
import json

if __name__ == '__main__':
    # Query parameters are passed separately via the params argument
    url = 'https://movie.douban.com/j/chart/top_list'
    param = {
        'type': '24',
        'interval_id': '100:90',
        'action':'',
        'start': '0',
        'limit': '20'
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    # The endpoint returns a JSON array of movie entries
    response = requests.get(url=url, params=param, headers=headers).json()
    print(response)
    # ensure_ascii=False keeps Chinese titles readable in the saved file
    with open('./douban.json', 'w', encoding='utf-8') as fp:
        json.dump(response, fp=fp, ensure_ascii=False)
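
Each element of the saved list describes one film. As a quick check of what came back, the entries can be iterated directly; the `title` and `score` keys below are assumptions based on what the chart interface returned at the time of writing.

```python
# Continuation of the example above: response is the parsed JSON list.
# 'title' and 'score' are assumed key names; .get() avoids a KeyError if they change.
for movie in response:
    print(movie.get('title'), movie.get('score'))
```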

KFC store locations with requests

The KFC store locator answers a POST to GetStoreList.ashx with JSON text describing the matching stores; the example sends the search keyword and paging fields and prints the raw response.

import requests

if __name__ == '__main__':
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    city = input('enter a location to search: ')
    param = {
        'cname':'',
        'pid':'',
        'keyword': city,
        'pageIndex': '1',
        'pageSize': '10'
    }
    header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    response = requests.post(url, params=param, headers=header).text
    print(response)
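
The `.text` returned above is itself JSON, so it can be parsed and walked to pull out the store entries. The key names in the sketch below (`Table1`, `storeName`, `addressDetail`) are assumptions taken from the payload this interface returned when the example was written.

```python
import json

# Continuation of the example above: response holds the raw JSON text.
stores = json.loads(response)
for store in stores.get('Table1', []):   # assumed key holding the store list
    print(store.get('storeName'), '-', store.get('addressDetail'))
```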

Licence information with requests

The NMPA portal exposes two POST interfaces: getXkzsList returns one page of licence records, each carrying an ID, and getXkzsById returns the detail for a single record. The example fetches the list first and then requests the detail of every record on it.

import requests

if __name__ == '__main__':
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
    param ={
        'on': 'true',
        'page': '1',
        'pageSize': '15',
        'productName':'',
        'conditionType': '1',
        'applyname':'',
        'applysn':''
    }
    header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    response = requests.post(url, params=param, headers=header).json()
    print(response)
    # Each record in the list carries an ID that the detail endpoint expects
    lis_ID = response['list']
    url2 = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    for item in lis_ID:
        param2 = {
            'id': item['ID']
        }
        # The detail endpoint accepts the same headers as the list request above
        response2 = requests.post(url2, params=param2, headers=header).json()
        print(response2['epsName'])
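
Rather than only printing the enterprise name, the loop can collect every detail record and write the batch to disk, mirroring how the Douban example saved its data. The sketch below reuses `lis_ID`, `url2` and `header` from the code above and stores each detail dict unchanged, since only the `epsName` field has been confirmed so far.

```python
import json

# Continuation sketch: accumulate the full detail responses and save them in one file.
all_details = []
for item in lis_ID:
    detail = requests.post(url2, params={'id': item['ID']}, headers=header).json()
    all_details.append(detail)

with open('./licence_details.json', 'w', encoding='utf-8') as fp:
    json.dump(all_details, fp, ensure_ascii=False)
```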

### A Worked Example: Scraping Douban Top 250 Movie Reviews with Python

#### Preparation

To fetch review information for the Douban Top 250 movies, a few libraries need to be installed first: `requests` for sending HTTP requests, `BeautifulSoup` for parsing HTML documents, and `pandas` with `openpyxl` for writing the results to a spreadsheet.

```bash
pip install requests beautifulsoup4 openpyxl pandas
```

#### Fetching page content

Page content is read by issuing a GET request to the target URL and checking the response:

```python
import requests
from bs4 import BeautifulSoup

def get_page_content(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        raise Exception(f"Failed to load page {url}")
```

This defines a function `get_page_content()` that takes a URL and returns the page source as a string[^1].

#### Parsing the movie entries on a single page

Each listing page contains multiple movie items, and a loop extracts the required fields from each one:

```python
from urllib.parse import urljoin

def parse_movie_items(html):
    soup = BeautifulSoup(html, "html.parser")
    movie_list = []
    items = soup.select('div.item')
    for item in items:
        title = item.find("span", class_="title").text.strip()
        rating_num = float(item.find("span", class_="rating_num").text.strip())
        review_link_tag = item.find_all("a")[1]
        # urljoin handles both relative and absolute hrefs without double-prefixing
        reviews_url = urljoin("https://movie.douban.com", review_link_tag['href'])
        movie_info = {"title": title, "rating": rating_num, "reviews_url": reviews_url}
        movie_list.append(movie_info)
    return movie_list
```

This parses every movie item on a single page, pulling out the title, rating and review link[^2].

#### Crawling multiple pages

The chart is spread over several sub-pages, so the pagination has to be walked as well:

```python
base_url = "https://movie.douban.com/top250?start={}&filter="

all_movies = []
for i in range(0, 250, 25):  # each page lists 25 movies
    url = base_url.format(i)
    html = get_page_content(url)
    movies_on_this_page = parse_movie_items(html)
    all_movies.extend(movies_on_this_page)
```

This iterates from the first page to the last, accumulating every record into a single list[^3].

#### Visiting the review sections and saving to a file

The final step is to visit the review area of each known movie and store the collected data:

```python
import time
from random import randint

import pandas as pd

def fetch_reviews_and_save_to_excel(all_movies):
    data_for_export = []
    for movie in all_movies[:5]:  # only the first five movies, as a test run
        try:
            print(f"Now fetching reviews of '{movie['title']}'...")
            review_html = get_page_content(movie["reviews_url"])
            soup_review = BeautifulSoup(review_html, "html.parser")
            short_comments_section = soup_review.find(id='hot-comments').find_all(class_='comment-item')
            for comment in short_comments_section:
                info = comment.find(class_='comment-info')
                # The rating is encoded in a class name such as "allstar50"
                star_class = info.span.attrs.get('class', [''])[0]
                data_for_export.append({
                    "Movie Title": movie['title'],
                    "Commenter Name": info.a.get_text(strip=True),
                    "Rating Stars": int(star_class.replace('allstar', '') or 0) / 10,
                    "Short Comment": comment.p.get_text(strip=True),
                })
            # Random delay between consecutive requests to avoid anti-crawling measures
            sleep_time = randint(3, 7)
            print(f'Sleeping for {sleep_time}s before next request...')
            time.sleep(sleep_time)
        except Exception as e:
            print(e)
    # Write the collected comments once, after all movies have been processed
    df = pd.DataFrame(data_for_export)
    df.to_excel('douban_top250_reviews.xlsx', index=False, engine='openpyxl')

fetch_reviews_and_save_to_excel(all_movies)
```

This script opens the discussion page of each movie gathered earlier, samples a handful of its hot short comments into a spreadsheet, and sleeps for a random interval between consecutive requests so as not to trigger the site's anti-crawling mechanisms.