以爬取某网站上的ip列表为例:
- 下载并安装postman,体验下自动生成请求头代码是啥感觉
- 获取网站的table, 按h2,th,td的顺序写入csv文件
- 运用bs4实现对txt、html、csv的自由转换
postman
生成请求头
进入postman的官网, 按提示下载安装即可
安装后运行postman,粘贴链接到图示位置, 点击send
加载完后点击code,选择一种语言,完事
自动生成的请求头代码:

分析网页结构:
table->tbody->tr->th, td
th的contents分为两种情况, 一是th的contents为h2(表头),二是行标签(每列数据的标题)
td也分俩种情况,一是contents为空,二是不为空,而遍历bs4.element.Tag时,会自动忽略空值,这就可能导致生成的csv列表某些列的数据和标题错位,解决错位的方法
因contents是list类,而在python中
[] is not None
用if else语句时,通过list是否等于None来判断是否为空是行不通的, 但可以这样
if td.contents:
pass
else:
iplist.append('占位')
当contents=[], else语句被执行,iplist添加一个值来占位(相当于Excel里的右移一格)
csv
读写
csv是python的标准库, 所以直接import, 不需要再去安装
import csv
with open(path, 'w+', newline='') as f:
writer = csv.writer(f)
writer.writerow(line)
writer.writerows(lines)
f.seek(0)
reader = csv.reader(f)
for row in reader:
print(row)
csv中row和rows的区别
line若为单个字符串, 在调用writerow方法时会被自动拆分为list, 如line="国内高匿代理IP"时,line会变成含8个字符的list( list=['国', '内', '高', '匿', '代', '理', 'I', 'P'] ),csv里看起来是这样的
显然这不是我们想要的结果, 可以改为line=["国内高匿代理IP"],运行结果如图1-A10单元格所示
同样的道理, writerows一次性写入多行, 如果lines="国内高匿代理IP", csv里看起来是这样的,lines会被拆分两次,变成含8个子列表的list (list=[['国'], ['内'], ['高'], ['匿'], ['代'], ['理'], ['I'], ['P']]),每个子列表内含一个字符,而一个列表对应csv的一行, 逗号相当于"分列". 所以最终变成一列八行
writerow和writerows方法的详细区别如“图 1”所示
因进行完写操作后, 读取指针指向的是文件末尾, 故进行读操作时使用了f.seek(0), 重新定位到文件头部
bs4解析
requests, bs4是第三方库, 需要自行安装
新版python用"pip3 install lib"指令安装时, 有时会报错
可以尝试"python -m pip install lib"指令
import requests
from bs4 import BeautifulSoup as bs
rep = requests.get(url)
soup = bs(rep.text, 'lxml')
'''
table = soup.table # 返回第一个table的Tag(注意: soup.table只取首个匹配, 取全部要用findAll)
tr = soup.tr
th = soup.th
以此类推
也可用findAll()
'''
for tr in soup.findAll('tr'): # 找出所有的tr并遍历,table row,表格的每一行
# findAll()返回所有符合条件的Tag,例如找出id为ip_list的标签:findAll(id='ip_list')
# 找出class为country的标签,可写作findAll(class_='country'),不要漏掉class_的下划线
# findAll()的A要大写, 也可写成find_all()
# 其他用法类似,自己多尝试几遍就好了
for th in tr.findAll('th'):
# 遍历表的标题行
for td in tr.findAll('td'):
# 遍历每行的每列
完整代码:
代码拆分的比较细, 按功能的实现逐一递进, 所以有点长
# 从网上获取可用IP并保存到本地
import requests, re, os, csv
from bs4 import BeautifulSoup as bs
url = "http://www.xicidaili.com/"
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
'Accept': "*/*",
'Cache-Control': "no-cache",
'Accept-Encoding': "gzip, deflate",
'Cookie': "_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWU2MjhkYmU2MWE1ODZjYmIyMTRmMTRmMWNiYzNkYzJhBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMWdOenZiVWVsTUVmbDNMN1Q1WndjNytpMlQ4SUZITHE2NmcvcHdOMnV4OUk9BjsARg%3D%3D--f7bbf38a9dc3d77201056c354e383f082ed02c2f",
'Referer': "http://www.xicidaili.com/",
'Connection': "keep-alive",
'cache-control': "no-cache"
}
def getrep():
rep = requests.get(url, headers=headers)
soup = bs(rep.text, 'lxml')
title = soup.title.contents[0]
return rep.text, title
def saveastxt(string, basename, filetype='txt'):
abspath = r'C:\Users\QQ\Desktop\ls\py\results\{}.{}'.format(basename, filetype)
dirname = os.path.dirname(abspath)
if not os.path.exists(dirname):
os.mkdirs(dirname)
with open(abspath, 'w') as f:
f.write(string)
# saveastxt(getrep()[0], getrep()[1])
def findall():
path = r'C:/Users/QQ/Desktop/ls/py/results/免费代理IP_HTTP代理服务器IP_隐藏IP_QQ代理_国内外代理_西刺免费代理IP.txt'
with open(path, 'r') as f:
string = f.read()
soup = bs(string, 'lxml')
# iplist = soup.table
# iplist = bs(str(soup.table), 'lxml')
# iplist = re.findall(r'<td[ class="country"]*?>(.+?)?</td>', str(soup.table))
with open(os.path.dirname(path) + r'\iplist.txt', 'w') as f:
#print('tr', len(soup.find_all('tr')), type(soup.find_all('tr')))
for tr in soup.findAll('tr'):
if tr.h2:
f.write('\n' + str(tr.h2.contents[0]))
for th in tr.findAll('th'):
f.write(str(th.contents[0]) + '\t')
for td in tr.findAll('td'):
'''
for i in td.contents:
i=str(i)
if 'img' in i:
i = re.findall(r'alt="(.+?)"', i)[0]
f.write(i + '\t')
'''
for i in td:
if td.img:
f.write(td.img.attrs['alt'] + '\t')
else:
f.write(i + '\t')
print(tr.contents)
f.write('\n')
# print(type(iplist), len(iplist), len(iplist[0]))
# saveastxt(str(iplist), r'iplist', 'html')
# findall()
def saveincsv0():
path = r'C:/Users/QQ/Desktop/ls/py/results/免费代理IP_HTTP代理服务器IP_隐藏IP_QQ代理_国内外代理_西刺免费代理IP.txt'
with open(path, 'r') as f:
string = f.read()
soup = bs(string, 'lxml')
with open(os.path.dirname(path) + r'\iplist.csv', 'w') as f:
# writer = csv.writer(f)
for tr in soup.findAll('tr'):
if tr.h2:
f.write('\n' + str(tr.h2.contents[0]))
for th in tr.findAll('th'):
f.write(str(th.contents[0]) + ',')
for td in tr.findAll('td'):
for i in td:
if td.img:
f.write(td.img.attrs['alt'] + ',')
else:
f.write(td.contents[0] + ',')
print(tr)
f.write('\n')
# saveincsv0()
def saveincsv():
path = r'C:/Users/QQ/Desktop/ls/py/results/免费代理IP_HTTP代理服务器IP_隐藏IP_QQ代理_国内外代理_西刺免费代理IP.txt'
with open(path, 'r') as f:
string = f.read()
soup = bs(string, 'lxml')
with open(os.path.dirname(path) + r'\iplist.csv', 'w', newline='') as f:
# get table contents in web page, wirte in csv
writer = csv.writer(f)
for tr in soup.findAll('tr'):
if tr.h2:
# print(tr.h2.contents)
writer.writerow([])
writer.writerow(tr.h2.contents)
iplist = []
for th in tr.findAll('th'):
if th.contents[0] != ' ':
iplist.append(th.contents[0])
if iplist:
# print(iplist)
writer.writerow(iplist)
iplist.clear()
# print('len(tr.findAll("td")): ', len(tr.findAll('td')))
for td in tr.findAll('td'):
# type(td): <class 'bs4.element.Tag'>
# print(td)
if td.contents:
# print(td)
pass
else:
iplist.append('')
print('!!!Neglected td: ', td, ', added to the iplist')
for i in td:
# 需要注意的是,当遍历bs4.element.Tag时,会自动忽略掉contents=None的Tag
# print(i)
if td.img:
# print('img in there: ', len(td.contents))
iplist.append(td.img.attrs['alt'])
else:
iplist.append(td.contents[0])
if iplist:
if len(iplist) != len(tr.findAll('td')):
print('len(tr.findAll("td")): ', len(tr.findAll('td')),
'len(iplist): ', len(iplist))
writer.writerow(iplist)
with open(os.path.dirname(path) + r'\iplist.csv', 'r') as fv:
# csv to txt
string = fv.read()
fv.close()
with open(os.path.dirname(path) + r'\iplistscv.txt', 'w') as f:
for i in string:
if i == '[' or i == ']':
f.write('\n')
elif i == ',':
f.write('\t')
else:
f.write(i)
# saveincsv()
def getandsaveincsv():
path = r'C:/Users/QQ/Desktop/ls/py/results/免费代理IP_HTTP代理服务器IP_隐藏IP_QQ代理_国内外代理_西刺免费代理IP.txt'
rep = requests.get(url, headers=headers)
string = rep.text
soup = bs(string, 'lxml')
with open(os.path.dirname(path) + r'\iplist.csv', 'w', newline='') as f:
writer = csv.writer(f)
for tr in soup.findAll('tr'):
if tr.h2:
# print(tr.h2.contents)
writer.writerow([])
writer.writerow(tr.h2.contents)
iplist = []
for th in tr.findAll('th'):
if th.contents[0] != ' ':
iplist.append(th.contents[0])
if iplist:
# print(iplist)
writer.writerow(iplist)
iplist.clear()
for td in tr.findAll('td'):
for i in td:
if td.img:
iplist.append(td.img.attrs['alt'])
else:
iplist.append(td.contents[0])
if iplist:
print(iplist)
writer.writerow(iplist)
with open(os.path.dirname(path) + r'\iplist.csv', 'r') as fv:
string = fv.read()
fv.close()
with open(os.path.dirname(path) + r'\iplistscv.txt', 'w') as f:
for i in string:
if i == '[' or i == ']':
f.write('\n')
elif i == ',':
f.write('\t')
else:
f.write(i)
# getandsaveincsv()
def readcsv(path = r'C:/Users/QQ/Desktop/ls/py/results/iplist.csv'):
with open(path, 'r') as f:
reader = csv.reader(f)
for row in reader:
print(row)
# print('finished printing the contents from csv which needs to be read')
# readcsv()
def dictwriteincsv():
path = r'C:/Users/QQ/Desktop/ls/py/results/iplist.csv'
with open(path, 'r') as f:
reader = csv.reader(f)
pedometer = 0
for row in reader:
if row:
if pedometer == 0:
pedometer = 1
pass
elif pedometer == 1:
headers = row
rowsdict = []
pedometer += 1
else:
rowsdict.append(dict(zip(headers, row)))
# print(headers)
# pedometer = 0
# for i in rowsdict:
# print(i.values())
path1 = r'C:/Users/QQ/Desktop/ls/py/results/dictwriteincsv.csv'
with open(path1, 'w+', newline='') as f:
DictWriter = csv.DictWriter(f, headers)
DictWriter.writeheader()
DictWriter.writerows(rowsdict)
f.seek(0)
reader = csv.reader(f)
for row in reader:
print(row)
print('print over')
def main():
# saveastxt(getrep()[0], getrep()[1])
saveincsv()
main()
运行结果
csv
txt
小结
csv可以看作由三部分组成:换行符、逗号、单元格文本。用file.write('\n')、file.write(',')可以实现csv的换行和分列
#不用csv库实现换行和分列
def testcsv1():
path = r'C:/Users/QQ/Desktop/ls/py/脚本/testresult.csv'
with open(path, 'w+', newline='') as f:
string = ''
for i in range(100):
string += str(i)
f.write(string + ',')
if i%10 == 0:
f.write('\n')
string = ''
testcsv1()
运行结果
csv操作虽然简单,但终究不支持各种格式设置,如果需要生成带格式的表格,可以安装和excel操作相关的模块来实现自动生成 .xls[x] 文件
运行代码时若出现 “PermissionError” 的提示,将浏览csv的软件关闭即可