清水河畔论坛二手帖子爬虫
1 # -*- coding:utf-8 -*-
2 '''
3 清水河畔二手帖子+爬取二手交易帖子
4 '''
5 import requests
6 import json
7 from bs4 import BeautifulSoup
8 import sys
9 import urllib
10 import re
11 from urllib import request,parse
12 import pymongo
13 #by 元帅 uestc 2018.2.28
class QSHSpider(object):
    """Crawler for the second-hand goods board of the QingShuiHe BBS
    (bbs.uestc.edu.cn).

    Workflow: fetch the login page with cookie-based headers, locate the
    second-hand board URL from the page header menu, ask the user for a
    topic, then walk the topic's listing pages, printing each thread's
    details and storing title/URL pairs into a local MongoDB.

    NOTE(review): "login" here relies entirely on a pre-filled 'Cookie'
    header — no credentials are actually posted.
    """

    def __init__(self):
        # Headers sent with every request.  Fill in 'Cookie' (and, if the
        # site requires them, 'username'/'password') before running.
        # Bug fix: the original header names/values were mangled by a
        # copy-paste ('Cache - Control', 'keep - alive', broken Referer URL,
        # 'Upgrade-Insecure - Requests') — all restored to valid HTTP form.
        self.headers = {
            'username': '',
            'password': '',
            'Cache-Control': '',
            'Connection': 'keep-alive',
            'Cookie': '',
            'Host': 'bbs.uestc.edu.cn',
            'Referer': 'http://bbs.uestc.edu.cn/member.php?mod=logging&action=login',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }

    def login(self):
        """Fetch the login page with the prepared headers and keep its HTML
        in ``self.login_request`` for later parsing."""
        request_url = "http://bbs.uestc.edu.cn/member.php?mod=logging&action=login"
        request_new = requests.get(request_url, headers=self.headers)
        self.login_request = request_new.text

    def get_shopurl(self):
        """Parse the fetched page and extract the second-hand board URL.

        Sets ``self.shopurl`` from the first link of the first entry in the
        header dropdown menu (div#hd -> ul#mn_F201_menu -> li -> a).
        """
        bf = BeautifulSoup(self.login_request, 'html.parser')
        shop_a = (bf.find_all('div', id='hd')[0]
                    .find_all('ul', id='mn_F201_menu', class_='p_pop h_pop')[0]
                    .find_all('li')[0]
                    .find_all('a')[0])
        self.shopurl = shop_a['href']
        print('登陆成功!')
        print('\n')
        print('您已进入二手帖子专题:' + self.shopurl)

    def get_tieziurls(self):
        """Ask the user for a topic and find that topic's listing URL.

        Sets ``self.tieziurls`` (the topic URL) and ``self.pagetag`` (the
        numeric ``typeid`` used by :meth:`get_pages` to build page URLs).
        """
        req = request.Request(url=self.shopurl, headers=self.headers, method="POST")
        response = request.urlopen(req)
        content = response.read()
        # Collect every anchor tag on the board page.
        anchors = re.findall(r"<a.*?href=.*?<\/a>", content.decode('utf-8'))
        print('备选主题有:书籍资料;生活用品;交通工具;卡券虚拟;数码硬件;'
              '拼单;物品租借;其他;版务/投诉;已解决;')
        searcher = input("请输入需要查找的主题 ")
        for anchor in anchors:
            if searcher not in str(anchor):
                continue
            # Un-escape '&amp;' back to '&' so the href is a usable URL.
            cleaned = anchor.replace('amp;', '')
            titles = re.findall(r'<a .*?>(.*?)</a>', cleaned, re.I | re.S | re.M)
            hrefs = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')",
                               cleaned, re.S | re.I | re.M)
            print(titles[0][0:8] + ' ' + hrefs[0])
            self.tieziurls = hrefs[0]
            # The second number in the URL is the typeid (the first is the
            # board's fid) — TODO confirm against a live listing URL.
            tags = re.findall(r"\d+\.?\d*", hrefs[0])
            self.pagetag = tags[1]
            break

    def get_tiezi(self):
        """Scrape one listing page (``self.new_url``): print every thread,
        fetch its detail page, and store its title/URL into MongoDB."""
        req_tiezi = request.Request(url=self.new_url, headers=self.headers, method="POST")
        response_tiezi = request.urlopen(req_tiezi)
        content_tiezi = response_tiezi.read()
        urls_tiezi = re.findall(r"<a.*?href=.*?<\/a>", content_tiezi.decode('utf-8'))
        # Open the MongoDB connection once per page rather than once per
        # thread (the original rebuilt the client inside the loop).
        client = pymongo.MongoClient('localhost', 27017)
        qingshuihepan = client['mydb']['qingshuihepan']
        for tiezi in urls_tiezi:
            # Thread-title links carry class="s xst"; skip everything else.
            if 'class="s xst"' not in tiezi:
                continue
            tiezi = tiezi.replace('amp;', '')
            tiezi_title = re.findall(r'<a .*?>(.*?)</a>', tiezi, re.I | re.S | re.M)
            tiezi_urls = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')",
                                    tiezi, re.S | re.I | re.M)
            print('\n\n')
            print('帖子主题:' + tiezi_title[0] + ' ' + '帖子地址:' + tiezi_urls[0])
            self.tiezi_url = tiezi_urls[0]
            # Bug fix: was infor.get_details() — called through the
            # module-level global instead of self.
            self.get_details()
            tiezi_data = {'帖子': {'title': tiezi_title[0], 'url': tiezi_urls[0]}}
            qingshuihepan.insert_one(tiezi_data)

    def get_pages(self):
        """Iterate listing pages 1..2 of the chosen topic, building each
        page URL from ``self.pagetag``, and scrape every page."""
        urls_based = ('http://bbs.uestc.edu.cn/forum.php?mod=forumdisplay&fid=61'
                      '&typeid={}&filter=typeid&typeid={}&page={}')
        for i in range(1, 3):
            self.new_url = urls_based.format(self.pagetag, self.pagetag, i)
            print('\n\n')
            print('第' + str(i) + '页' + ' ' '本页网址:' + self.new_url)
            # Bug fix: was infor.get_tiezi() (module-level global).
            self.get_tiezi()

    def get_details(self):
        """Fetch ``self.tiezi_url`` and print the first post's body text
        ('t_f' is the post-content class in this forum's markup)."""
        print('本帖详细内容:')
        req_detail = request.Request(url=self.tiezi_url, headers=self.headers, method="POST")
        response_detail = request.urlopen(req_detail)
        content_detail = response_detail.read()
        bs = BeautifulSoup(content_detail, 'html.parser')
        print(bs.find_all(class_='t_f')[0].text.strip())
126
if "__main__" == __name__:
    # NOTE: the global MUST be named 'infor' — the original class methods
    # call back through this module-level name.
    infor = QSHSpider()
    # Bug fix: announce the login *before* performing it (the original
    # printed the progress message after login() had already returned).
    print('登陆清水河畔ing……')
    infor.login()
    infor.get_shopurl()
    infor.get_tieziurls()
    infor.get_pages()
posted @ 2018-04-01 21:33 Edge_of_Eternity 阅读(...) 评论(...) 编辑 收藏