使用python爬取网页信息

python爬取网上数据

可自行下载 Fiddler 抓包工具,分析网页接口返回的参数值

demo1

##---------------------------------------爬取网页表格信息
from bs4 import BeautifulSoup
import requests
import csv
import bs4
 
# Fetch a page and return its text, or None when the request fails.
def check_link(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded page text, or ``None`` when the request fails
        (connection problem, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding guessed from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are handled here, so that
        # KeyboardInterrupt and programming errors are not swallowed.
        print('无法链接服务器!!!')
        return None
 
 
# Collect the contents of every table row on the page.
def get_contents(ulist,rurl):
    """Append one list per ``<tr>`` element found in ``rurl`` to ``ulist``.

    ``rurl`` is parsed by BeautifulSoup with the ``lxml`` parser. For each
    row, the appended list holds the ``.string`` value of every direct child
    of the row, in document order. Iterating a row yields all of its direct
    children, so any text nodes sitting between cells get an entry as well.

    Args:
        ulist: List that receives the per-row lists (mutated in place).
        rurl: Markup to parse.
    """
    document = BeautifulSoup(rurl, 'lxml')
    for row in document.find_all('tr'):
        ulist.append([child.string for child in row])
    
# Write the scraped rows to a CSV file.
def save_contents(urlist, path="D:/2016年中国企业500强排行榜.csv", encoding=None):
    """Write entries 1, 3 and 5 of each row in ``urlist`` to a CSV file.

    The file starts with a single-cell title row. Rows with fewer than six
    entries cannot supply all three columns and are skipped instead of
    raising ``IndexError``.

    Args:
        urlist: Sequence of rows; each row is an indexable sequence.
        path: Destination file. Defaults to the original hard-coded location.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is what the file was written with before
            this parameter existed.
    """
    with open(path, 'w', newline='', encoding=encoding) as f:
        writer = csv.writer(f)
        writer.writerow(['中国企业500强排行榜'])
        for row in urlist:
            if len(row) < 6:
                # Too short to hold positions 1, 3 and 5.
                continue
            writer.writerow([row[1], row[3], row[5]])
 
def main():
    """Fetch the ranking page, extract its table rows and save them as CSV.

    Does nothing further when the page could not be downloaded.
    """
    urli = []
    url = "http://www.maigoo.com/news/463071.html"
    rs = check_link(url)
    if rs is None:
        # check_link has already printed the failure message; without page
        # text there is nothing to parse or save.
        return
    get_contents(urli, rs)
    save_contents(urli)


if __name__ == "__main__":
    main()

爬取新闻文章标题

import requests
from lxml import etree

# Download the blog's article list; the timeout keeps the script from
# hanging forever on an unresponsive server.
html = requests.get("https://blog.youkuaiyun.com/it_xf?viewmode=contents", timeout=10)

page_tree = etree.HTML(html.text)
# Select the text nodes of the article title links.
etree_html = page_tree.xpath('//*[@class="mainBox"]/main/div[2]/div/h4/a/text()')

for each in etree_html:
    # Strip newlines and spaces; text nodes that were pure whitespace
    # become empty strings and are skipped.
    title = each.replace('\n', '').replace(' ', '')
    if title:
        print(title)

爬取网页表格数据,生成 Excel 表格,实现数据统计

#request模块
import urllib.request
# 导入正则匹配包
import re
import csv
import numpy as np

url="https://www.ittime.com.cn/news/zixun.shtml"
# Page source. The response is closed as soon as the body has been read;
# undecodable bytes are dropped.
with urllib.request.urlopen(url) as response:
    url_yuan = response.read().decode("utf-8", "ignore")

# Patterns that pull the wanted fields out of the page markup.
imgRe=re.compile(r'src="(.*?\.jpg)"')
titleRe=re.compile(r'<h2><a href=".*?" target="_blank">(.*?)</a></h2>')
contentRe=re.compile(r'<p>(.*?)</p>')
authorRe=re.compile(r'<span class="pull-left from_ori">(.*?)<span class="year">(.*?)</span></span>')

# Run every pattern over the page source.
titles=titleRe.findall(url_yuan)
images=imgRe.findall(url_yuan)
content=contentRe.findall(url_yuan)
# authorRe has two capture groups, so each entry here is a 2-tuple.
authors=authorRe.findall(url_yuan)

# Assemble one row per article. zip stops at the shortest list, so a page
# where the patterns match a different number of times yields fewer rows
# instead of raising IndexError.
# NOTE(review): the image is written twice and `content` is never used;
# the fourth column was probably meant to be the content match. Confirm
# before changing, since the output layout would change.
resultList = [
    [title, author, image, image]
    for title, author, image in zip(titles, authors, images)
]

# Write the rows to a CSV file that spreadsheet tools can open.
with open("D:/博客园文章信息.csv",'w',newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['博客园文章信息'])
    writer.writerows(resultList)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值