python爬取网上数据
自己可下载Fiddler抓包工具,分析网页接口返回参数值
demo1
##---------------------------------------爬取网页表格信息
from bs4 import BeautifulSoup
import requests
import csv
import bs4
#检查url地址
def check_link(url):
    """Fetch *url* and return the decoded page text, or None on failure.

    Sets the response encoding from ``apparent_encoding`` so Chinese
    pages decode correctly.
    """
    try:
        # timeout so a dead server cannot hang the script forever
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # narrow except: only network/HTTP errors are "server unreachable";
        # a bare except would also swallow KeyboardInterrupt and real bugs
        print('无法链接服务器!!!')
        return None
#爬取资源
def get_contents(ulist, rurl):
    """Parse the HTML text *rurl* and append one row per <tr> to *ulist*."""
    soup = BeautifulSoup(rurl, 'lxml')
    for row in soup.find_all('tr'):
        # NOTE(review): iterating a Tag yields every child node, including
        # whitespace text nodes between <td> tags — kept deliberately,
        # because the caller indexes the real cells at positions 1, 3, 5
        ulist.append([cell.string for cell in row])
#保存资源
def save_contents(urlist, path="D:/2016年中国企业500强排行榜.csv"):
    """Write three columns of *urlist* to a CSV file.

    Args:
        urlist: rows as produced by get_contents(); the interesting cells
            sit at indexes 1, 3 and 5 (even indexes are whitespace nodes).
        path: output file; the old hard-coded location is kept as the
            default so existing callers are unaffected.
    """
    # newline='' is required by the csv module; utf-8-sig (BOM) makes
    # Excel on Windows detect the encoding of the Chinese text
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['中国企业500强排行榜'])
        for row in urlist:
            writer.writerow([row[1], row[3], row[5]])
def main():
    """Download the ranking page and dump its table to a CSV file."""
    urli = []
    url = "http://www.maigoo.com/news/463071.html"
    rs = check_link(url)
    # check_link yields None on network failure; without this guard
    # get_contents would crash parsing None
    if rs is not None:
        get_contents(urli, rs)
        save_contents(urli)


if __name__ == "__main__":
    main()
爬取新闻文章标题
import requests
from lxml import etree
html = requests.get("https://blog.youkuaiyun.com/it_xf?viewmode=contents")
etree_html = etree.HTML(html.text)
#获取想要的信息
etree_html=etree_html.xpath('//*[@class="mainBox"]/main/div[2]/div/h4/a/text()')
for each in etree_html:
replace = each.replace('\n', '').replace(' ', '')
if replace == '\n' or replace == '':
continue
else:
print(replace)
爬取网页表格数据,生成Excel表格,实现数据统计
#request模块
import urllib.request
# 导入正则匹配包
import re
import csv
import numpy as np
url="https://www.ittime.com.cn/news/zixun.shtml"
url_yuan=urllib.request.urlopen(url).read().decode("utf-8","ignore") #源码
#根据网页结构,获取数据
imgRe=re.compile(r'src="(.*?\.jpg)"')
titleRe=re.compile(r'<h2><a href=".*?" target="_blank">(.*?)</a></h2>')
contentRe=re.compile(r'<p>(.*?)</p>')
authorRe=re.compile(r'<span class="pull-left from_ori">(.*?)<span class="year">(.*?)</span></span>')
# 匹配网页对应的标题数据
titles=titleRe.findall(url_yuan)
images=imgRe.findall(url_yuan)
content=contentRe.findall(url_yuan)
authors=authorRe.findall(url_yuan)
#拼接页面上需要的字段
resultList= []
for i in range(len(titles)):
list=[]
resultList_i=[titles[i],authors[i],images[i],images[i]]
resultList.append(resultList_i)
#打印成exel
with open("D:/博客园文章信息.csv",'w',newline='') as f:
writer = csv.writer(f)
writer.writerow(['博客园文章信息'])
for i in range(len(resultList)):
writer.writerow([resultList[i][0],resultList[i][1],resultList[i][2],resultList[i][3]])