# -*- coding:utf-8 -*-
"""Scrape film entries from the maoyan.com board/4 listing.

Each page is downloaded, every ``<dd>`` film entry is parsed with a regular
expression, and each parsed film is printed and appended to ``result.txt``
as one JSON object per line.
"""
import io
import json
import re

# Matches one <dd>...</dd> film entry.  Capture groups, in order:
# ranking index, poster URL, title, cast line, release line, and the integer
# and fractional halves of the score.  re.S lets '.' span line breaks, since
# an entry is spread over several lines of markup.
_FILM_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def get_one_page(url, timeout=10):
    """Download *url* and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        None (non-200 status, or any request-level failure).
    """
    # Imported here rather than at module level so that the parsing and
    # file-writing helpers stay importable without the third-party package.
    import requests

    try:
        # Without a timeout requests may block indefinitely on a dead server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def write_to_file(conetxt, path='result.txt'):
    """Append *conetxt* to *path* as a single JSON line.

    Args:
        conetxt: JSON-serialisable object (here, one parsed film dict).
        path: File to append to; it is created if it does not exist.
    """
    # ensure_ascii=False keeps non-ASCII characters readable instead of
    # emitting \uXXXX escapes, so the file must be written as UTF-8.
    # io.open accepts the encoding argument on Python 2.7 as well as 3.
    line = json.dumps(conetxt, ensure_ascii=False) + u'\n'
    with io.open(path, 'a', encoding='utf-8') as f:
        f.write(line)


def parse_one_page(html):
    """Yield one dict per film entry found in *html*.

    Args:
        html: Page markup as text, or None/empty when the download failed.

    Yields:
        Dicts with the keys "index", "image", "title", "actor", "time" and
        "socre", all holding text values.  Nothing is yielded for empty
        input.
    """
    if not html:
        return
    for index, image, title, star, release, integer, fraction in (
            _FILM_PATTERN.findall(html)):
        yield {
            "index": index,
            "image": image,
            "title": title,
            # Drop the 3-character label that precedes the cast list.
            "actor": star.strip()[3:],
            # Drop the 5-character label that precedes the release date.
            "time": release.strip()[5:],
            # The key is misspelled ("socre"); it is kept as-is so that
            # existing consumers of result.txt keep working.
            "socre": integer + fraction,
        }


def main(offset):
    """Fetch the board page starting at *offset*, then print and save its films."""
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Ten pages, with the offset advancing by 10 per page.
    for i in range(10):
        main(i * 10)
代码示例,基于 Python 2.7 版本。注意字节串的问题:直接打印 list 时,其中的中文只会输出为 ASCII 转义码,需要使用 json 格式(ensure_ascii=False)将其输出改为 UTF-8 文本。
在 yield 返回的各字段后追加 encode("utf-8"),指定编码格式。