BeautifulSoup 解析 HTML 页面——Python 爬虫

壶小旭

已于 2022-11-03 20:36:31 修改

阅读量418

点赞数

分类专栏： Python 文章标签： beautifulsoup python 爬虫

于 2022-11-03 20:32:54 首次发布

本文链接：https://blog.youkuaiyun.com/qq_38531623/article/details/127677760

版权

Python 专栏收录该内容

3 篇文章

订阅专栏

import requests
from bs4 import BeautifulSoup

"""
description：爬取東方語言學網
word：待查字
zu：閩語|吳語|平話|客家|贛語|官話
"""


def crawl_main(word, zu, timeout=10):
    """Look up a character on eastling.org and return the result table.

    Posts the query form to ``fangyan_word_go.php`` and extracts the
    second ``<table>`` of the returned page, which is the one this scraper
    treats as the result table.

    Args:
        word: The character to look up.
        zu: The dialect group to search, passed through as the ``zu`` form
            field (e.g. 閩語, 吳語, 平話, 客家, 贛語, 官話).
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        A dict with two keys:
            ``heads``: text of every ``<th>`` cell in the table's first row.
            ``cols``: one list of ``<td>`` cell texts per row, excluding
            the first row.
        Both lists are empty when the page has fewer than two tables.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    url = "http://eastling.org/fangyan_word_go.php"
    payload = {
        'word': word,
        'zu': zu,
        'mode': 'word',
        'map': '查 詢',
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
    }

    ret = requests.post(url, headers=headers, data=payload, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page.
    ret.raise_for_status()
    # Decode with the encoding detected from the body rather than the
    # (possibly missing or wrong) charset in the response headers.
    ret.encoding = ret.apparent_encoding

    soup = BeautifulSoup(ret.text, 'html.parser')
    tables = soup.find_all('table')
    if len(tables) < 2:
        # No second table on the page: report an empty result instead of
        # failing with an opaque IndexError.
        return {'heads': [], 'cols': []}

    trs = tables[1].find_all('tr')
    # The header cells live in the first row only.
    heads = [th.get_text() for th in trs[0].find_all('th')] if trs else []
    # Every row after the first contributes one list of cell texts.
    cols = [[td.get_text() for td in tr.find_all('td')] for tr in trs[1:]]
    return {'heads': heads, 'cols': cols}


if __name__ == '__main__':
    # Demo run: query one character in one dialect group and print the
    # parsed table.
    result = crawl_main("好", "粵語")
    print(result)

关注博主即可阅读全文