1网页为post请求,同时记录cookie模拟登陆
class CreditJizhouSpider(Spider): name = 'credit_jizhou_spider' start_urls = [ 'http://www.tjjz.gov.cn/jx_search/search.do' ] domain = 'http://www.tjjz.gov.cn/' custom_settings = cn_punishments_settings def start_requests(self): return [Request('http://www.tjjz.gov.cn/jx_search/search.do', meta={'cookiejar': 1}, callback=self.parse)] def parse(self, response): print 'response', response.url try: for i in range(0, 100, 10): data = { 'websiteType': '0', 'startlocation': str(i), 'searchType': 'keywords', 'keyword': '行政处罚' } # Cookie = response.headers.getlist('Set-Cookie') # print Cookie print data yield FormRequest.from_response(response, url='http://www.tjjz.gov.cn/jx_search/search.do', meta={'cookiejar': response.meta['cookiejar']}, formdata=data, callback=self.parse_infourl, )2网页编码格式为gb2312, 同时xpath匹配到的url中筛选 不要http开头的
# The response body is gb2312-encoded; decode it before feeding the markup
# to a Selector, then grab every <dl> entry in the listing box.
decoded_html = response.body.decode('gb2312')
dl_lables = Selector(text=decoded_html).xpath('//div[@class="list_pic"]/dl')
# BUG FIX: the original assigned to 'rl' but the very next line reads 'url' —
# the leading 'u' of the variable name was lost in extraction.
url = a_lable.xpath('@href').extract()[0]
# Relative links carry no scheme; prepend the site root so they resolve.
if not re.search(r'http://', url):
    url = 'http://bsq.sh.gov.cn/xxgk_website/html' + url
3 xpath 中string()与text()的区别
本质区别
text()是一个node test,而string()是一个函数,data()是一个函数且可以保留数据类型。此外,还有点号(.)表示当前节点。
使用要点
XML例子:
<book><author>Tom John</author></book>
用例 | 举例 |
---|---|
text() | book/author/text() |
string() | book/author/string() |
data() | book/author/data() |
. | book/author/. |
特殊用例
XML例子:
- 1
- 2
- 3
- 4
- 5
- 6
- 7
text()
经常在XPath表达式的最后看到text(),它仅仅返回所指元素的文本内容。
例如对于 XML:
<book><author>Tom <em>John</em> cat</author></book>
使用 book/author/text() 返回的结果是 Tom 和 cat,其中的 John 不属于 author
直接的节点内容(它位于子元素 <em> 之内);而 string() 会返回 Tom John cat。
def parse(self, response): print 'response', response.url try: if response.xpath('string()').extract_first().strip() == '加载异常,请重新刷新页面': print 'The page is over!' else: for x in self.parse_url_data(res=response.text): yield x yield Request(self.url_pattern.format(page_num=self.count), callback=self.parse, errback=errback_httpbin) self.count += 1 except NetError as e: import os client.captureException(message=e.message, fingerprint=e.fingerprint, tags={'custom_tag': e.tags.get('custom_tag'), 'spider': __file__.split(os.sep)[-1]}) except Exception: import os import traceback traceback.print_exc() # client.captureException(tags={'spider': __file__.split(os.sep)[-1]}) def parse_url_data(self, res): try: div_lables = Selector(text=res).xpath('//div[@class="result"]')5eval函数
# SECURITY(review): eval() on a network response executes arbitrary code if
# the server (or anyone in the middle) returns malicious content. Prefer
# json.loads() or ast.literal_eval(). Flagged rather than changed here to
# preserve the documented behavior of this note.
page_dict = eval(response.text)
1 可以把list,tuple,dict和string相互转化。
2 #################################################
#################################################
# 字符串转换成列表
>>> a = "[[1,2], [3,4], [5,6], [7,8], [9,0]]"
>>> type(a)
<type 'str'>
>>> b = eval(a)
>>> print b
[[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]]
>>> type(b)
<type 'list'>
#################################################
# 字符串转换成字典
>>> a = "{1: 'a', 2: 'b'}"
>>> type(a)
<type 'str'>
>>> b = eval(a)
>>> print b
{1: 'a', 2: 'b'}
>>> type(b)
<type 'dict'>
#################################################
# 字符串转换成元组
>>> a = "([1,2], [3,4], [5,6], [7,8], (9,0))"
>>> type(a)
<type 'str'>
>>> b = eval(a)
>>> print b
([1, 2], [3, 4], [5, 6], [7, 8], (9, 0))
>>> type(b)
<type 'tuple'>
6 如果在页面中存在下一页,就对页面的详情页进行解析,如果没有则反之
def parse(self, response): print 'response', response.url try: infoList = response.xpath('//*[@class="ilBox fl"]/ul/li/a') for info in infoList: href = info.xpath('@href').extract_first() if href: if u'http' in href: detail_page = href else: detail_page = ''.join(['http://www.creditcz.gov.cn:8088/czxyweb/pages', href[2:]]) print 'detail_page:', detail_page yield Request(detail_page, callback=self.parse_detail, errback=errback_httpbin) pageList = response.xpath('//*[@class="fenyeBox fl font12"]/span/a') for page in pageList: title = page.xpath('string()').extract_first() if title and u'下一页' in title: link = page.xpath('@href').extract_first() if link: next_page = ''.join(['http://www.creditcz.gov.cn:8088/czxyweb/pages/xygs', link[1:]]) print 'NextPage:', next_page yield Request(next_page, callback=self.parse, errback=errback_httpbin)
7获取总页数
# 7. Total page count: the page embeds '..."pageCount":N...' in a script blob.
#    A capture group extracts N in one pass (the original ran two searches and
#    used Py2-only ur'' literals; r'' suffices for these ASCII patterns).
total_page = re.search(r'pageCount":(\d+)', response.text).group(1)

# 8. POST request
def parse_url(self, response):
    """Turn each record of the JSON listing into a detail-page POST request."""
    # print response.body
    resultDict = json.loads(response.body)
    # print resultDict
    com_lists = resultDict["data"]
    for com_list in com_lists:
        # Renamed from 'id': the original shadowed the builtin id().
        record_id = com_list["ID"]
        seq_id = com_list["SEQ_ID"]
        # org = com_list["ORG"]
        data = {
            'id': str(record_id),
            'seqId': str(seq_id)
        }
        yield FormRequest(self.post_url, formdata=data, callback=self.parse_detail)

# 8. xpath-matching the page body text
# Join the text of every <p> paragraph into one content string.
# Hoisted the xpath into a variable: the original evaluated "//p" twice.
p_nodes = response.xpath("//p")
if p_nodes.extract():
    content_list = p_nodes.xpath("string(.)").extract()
    info_dict["content"] = ''.join(content_list)
8匹配表格并格式化输出
import re from scrapy.spiders import Spider from scrapy import Request from goldmine.config.sentry_config import client from goldmine.config.pipeline_config import cn_punishments_settings from goldmine.utils.custom_exception import NetError from goldmine.utils.httpbin_err import errback_httpbin from goldmine.utils.format_string import cn_punishments_common_item class CreditYangpuSpider(Spider): name = 'credit_yangpu_spider' start_urls = [ 'http://www.shyp.gov.cn/MHXxgk/Xxgk_Xzcf' ] url = 'http://www.shyp.gov.cn/MHXxgk/Xxgk_Xzcf?Dyc=&mid=&year=0&pageIndex={page_num}' custom_settings = cn_punishments_settings def parse(self, response): try: page_content = response.xpath('//div[@class="news_page"]/span/a') if page_content: page_num = int(page_content[-1].xpath('string()').extract_first().strip()[-3:-1]) for i in range(page_num): yield Request(self.url.format(page_num=str(i + 1)), callback=self.parse_detail, errback=errback_httpbin) except NetError as e: import os client.captureException(message=e.message, fingerprint=e.fingerprint, tags={'custom_tag': e.tags.get('custom_tag'), 'spider': __file__.split(os.sep)[-1]}) except Exception: import os import traceback traceback.print_exc() # client.captureException(tags={'spider': __file__.split(os.sep)[-1]}) def parse_detail(self, response): print 'detail', response.url try: tr_head = None tr_content = None head_content = [] index_num = [] infos = {} ###匹配表格中的所有内容 tr_lables = response.xpath('//div[@class="public_b_ul"]/table/tbody/tr') for index, tr in enumerate(tr_lables): ##枚举,index为key,tr为value if tr.xpath('./td'): if tr.xpath('./td')[0].xpath('string()').extract_first().strip() == u'序号': tr_head = tr_lables[index] #第一行为行头 tr_content = tr_lables[index + 1:] #其他行为内容 break if tr_head and tr_content: for index, td_head in enumerate(tr_head.xpath('./td')): #index为行头的key head_content.append(td_head.xpath('string()').extract_first().strip()) dic_key = self.get_table_item(td_head.xpath('string()').extract_first().strip()) if 
dic_key: index_num.append(index) #行头列表改为key为123value为分类名称的字典 infos[index] = dic_key for tr in tr_content: item_infos = {} content = {} temp_infos = {} flag = True if len(tr.xpath('./td')[0].xpath('string()').extract_first().strip()) == 0: break for index, td_content in enumerate(tr.xpath('./td')):#内容的kv枚举 if td_content.xpath('./div'): td_content = td_content.xpath('./div')[0] #每一条内容 content[head_content[index]] = td_content.xpath('string()').extract_first().strip() #转为 分类名称为k,内容为v的字典 if index in index_num: if infos[index] == 'legal_person' and len( td_content.xpath('string()').extract_first().strip()) == 0: flag = False temp_infos[infos[index]] = td_content.xpath('string()').extract_first().strip() if flag: item_infos.update(temp_infos) content_list = [] for key, value in content.items(): content_list.append(':'.join((key, value), )) content = '\n'.join(content_list) item_infos['content'] = content item_infos['country_code'] = 'CHN' if item_infos.get('punished_by'): text = item_infos.get('punished_by') if re.search(ur'(海)', text) and re.search(r'-', text): item_infos['penalty_decision_date'] = item_infos.get('punished_by')[-10:].strip() item_infos['punished_by'] = item_infos.get('punished_by')[:-10].strip() elif re.search(ur'海', text): item_infos['punished_by'] = item_infos.get('punished_by').strip() else: item_infos['punished_by'] = '' item_infos['penalty_decision_date'] = item_infos.get('punished_by').strip() item = cn_punishments_common_item(response, **item_infos) yield item except NetError as e: import os client.captureException(message=e.message, fingerprint=e.fingerprint, tags={'custom_tag': e.tags.get('custom_tag'), 'spider': __file__.split(os.sep)[-1]}) except Exception: import os import traceback traceback.print_exc() # client.captureException(tags={'spider': __file__.split(os.sep)[-1]}) def get_table_item(self, s): if re.search(ur'行政处罚决定书文号', s): return 'case_no' if re.search(ur'(行政处罚的履行方式和期限)', s): return 'penalty_results ' if re.search(ur'(违法企业名称)', 
s): return 'penalty_name' if re.search(ur'(法定代表人)', s): return 'legal_person' if re.search(ur'(主要违法事实)', s): return 'punished_reason' if re.search(ur'(行政处罚的种类和依据)', s): return 'punished_basis' if re.search(ur'(组织机构代码)', s): return 'organization_code' if re.search(ur'(作出处罚的机关名称和日期)', s): return 'punished_by' else: return None
1列表转字典
2dict的 update 与 item 用法
for key, value in content.items():
    content_list.append(':'.join((key, value)))

以下实例展示了 update() 函数的使用方法:
#!/usr/bin/python
dict = {'Name': 'Zara', 'Age': 7}
dict2 = {'Sex': 'female'}
dict.update(dict2)
print "Value : %s" % dict

以上实例输出结果为:
Value : {'Age': 7, 'Name': 'Zara', 'Sex': 'female'}
dict = {'Google': 'www.google.com', 'Runoob': 'www.runoob.com', 'taobao': 'www.taobao.com'}
print "字典值 : %s" % dict.items()

# 遍历字典列表
for key, values in dict.items():
    print key, values
先出一个题目:1.有一 list= [1, 2, 3, 4, 5, 6]
请打印输出:
0, 1
1, 2
2, 3
3, 4
4, 5
5, 6
打印输出,
2.将 list 倒序成 [6, 5, 4, 3, 2, 1]
3.将a 中的偶数挑出 *2 ,结果为 [4, 8, 12]
这个例子用到了python中enumerate的用法。顺便说一下enumerate在for循环中得到计数的用法,enumerate参数为可遍历的变量,如 字符串,列表等; 返回值为enumerate类。
示例代码如下所示:
问题1.2.3.一同解决,代码如下:
# Tutorial example: enumerate, reversal, and filtering evens.
# (Variable 'list' shadows the builtin; kept to match the surrounding text.)
list = [1, 2, 3, 4, 5, 6]
for i, j in enumerate(list):  # BUG FIX: the original 'for' line lacked its colon
    print(i, j)
list2 = list[::-1]
# i % 2 == 0 means i is even, so `not i % 2` is true exactly for even i.
# BUG FIX: the original used '//' as a comment marker, which is floor
# division in Python; comments start with '#'.
list3 = [i * 2 for i in list if not i % 2]
print(list2, list3)
>>>0,1
>>>1,2
>>>2,3
>>>3,4
>>>4,5
>>>5,6
>>>[6,5,4,3,2,1]
>>>[4,8,12]
在同时需要index和value值的时候可以使用 enumerate。下列分别将字符串,数组,列表与字典遍历序列中的元素以及它们的下标:
一,字符串:
for i,j in enumerate('abcde'):
print i,j
>>>0,a
>>>1,b
>>>2,c
>>>3,d
>>>4,e
二,元组(Python 中没有内置数组类型,此例为元组):
for i,j in enumerate(('a','b','c')):
print i,j
输出结果为:
>>>0 a
>>>1,b
>>>2,c
三,列表:
四,字典:
for i,j in enumerate({'a':1,'b':2}):
print i,j
输出结果为:
>>>0 a
>>>1,b