- Scrape a web novel
```python
from bs4 import BeautifulSoup
import urllib.request

url = "http://www.jueshitangmen.info/tian-meng-bing-can-11.html"
html = urllib.request.urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')

# Extract the text inside every <p> tag
all_p = soup.find_all('p')
for i in all_p:
    print('\n', i.get_text())
```
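The loop above only prints the paragraphs. If you want to keep the chapter, a minimal follow-up sketch writes the same text to disk (the `chapter.txt` file name is an arbitrary choice, not part of the original):

```python
from bs4 import BeautifulSoup
import urllib.request

url = "http://www.jueshitangmen.info/tian-meng-bing-can-11.html"
html = urllib.request.urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')

# Join every <p> block into one string and save it
# ('chapter.txt' is a hypothetical output path)
text = '\n'.join(p.get_text() for p in soup.find_all('p'))
with open('chapter.txt', 'w', encoding='utf-8') as f:
    f.write(text)
```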
- Scrape Baidu Images
```python
import urllib.request
import urllib.parse
import os
import re

# Both headers are required: without Referer the server returns a 403,
# and User-Agent makes the request look like it comes from a browser
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'referer': 'https://image.baidu.com'
}

url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&cg=girl&pn={pageNum}&rn=30&gsm=1e00000000001e&1490169411926="

keyword = input("Enter a keyword: ")
keyword = urllib.parse.quote(keyword, 'utf-8')  # URL-encode the keyword

n = 0  # page counter
j = 0  # image counter, used in the file names

while n < 3000:
    n += 1
    error = 0  # reset per page, otherwise one bad page would skip all later ones
    # Build and send the request for this page
    url1 = url.format(word=keyword, pageNum=str(n))
    rep = urllib.request.Request(url1, headers=header)
    rep = urllib.request.urlopen(rep)
    # Read the response body
    try:
        html = rep.read().decode('utf-8')
    except Exception:
        print("Failed to read the response")
        error = 1
    print("Current page:", n)
    if error == 1:
        continue
    # Pull the image URLs out of the JSON with a regex
    pattern = re.compile('thumbURL":"(.*?)"')
    data = re.findall(pattern, html)
    # Make sure the download directory exists
    if not os.path.isdir("D://pictures/图片"):
        os.makedirs(r"D://pictures/图片")
    # Download every image on this page
    for i in data:
        print(i)
        urllib.request.urlretrieve(i, "D://pictures/图片/pic{num}.jpg".format(num=j))
        j += 1

print("Total images downloaded: " + str(j))
```
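The thumbURL list returned by Baidu often contains dead links, and a single failed `urlretrieve` call would crash the loop above. A more defensive download helper might look like this (a sketch; `download_all` is a hypothetical name, reusing the `data` list and `j` counter from the code above):

```python
import urllib.request

def download_all(urls, out_dir, start_index=0):
    """Download each URL, skipping any that fail; return the next free index."""
    j = start_index
    for u in urls:
        try:
            urllib.request.urlretrieve(u, "{}/pic{}.jpg".format(out_dir, j))
            j += 1
        except OSError as e:  # urllib.error.URLError is a subclass of OSError
            print("skipped", u, "-", e)
    return j

# Replacing the inner download loop above:
# j = download_all(data, "D://pictures/图片", j)
```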
- Scrape Maoyan Top 100 movie information
```python
import urllib.request
import re
import json
from multiprocessing import Pool

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'referer': 'https://image.baidu.com'
}

# Fetch one page of the board
def get_one_page(url):
    rep = urllib.request.Request(url, headers=header)
    rep = urllib.request.urlopen(rep)
    s = rep.read().decode("utf-8")
    return s

# Pull the fields we need out of the HTML with a regex
def parse_one_page(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)'
        r'</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>'
        r'.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the "主演:" prefix
            'time': item[4].strip()[5:],    # drop the "上映时间:" prefix
            'score': item[5] + item[6]      # integer part + fraction part
        }

# Append one record to the output file as a JSON line
def writedata(data):
    with open("maoyan.txt", 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False) + '\n')

def main(num):
    url = 'http://maoyan.com/board/4?offset=' + str(num)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        writedata(item)

if __name__ == '__main__':
    # Create a process pool and fetch the ten pages in parallel
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
```
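Each line of `maoyan.txt` holds one JSON object, so reading the results back is straightforward. A small sketch (`load_movies` is a hypothetical helper, assuming the file was produced by `writedata` above):

```python
import json

def load_movies(path="maoyan.txt"):
    # One JSON-encoded movie record per line
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

movies = load_movies()
print(len(movies), "records loaded")
```

Note that the pool's worker processes all append to the same file; for ten short pages the risk of interleaved lines is small, but a `multiprocessing.Lock` or one file per worker would be safer.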