Task4(2天)
**4.1** 实战大项目
-
实战大项目:模拟登录丁香园,并抓取论坛页面所有的人员基本信息与回复帖子内容。
-
参考资料:https://blog.youkuaiyun.com/nao77/article/details/88316754
import json
import random
import re
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By


class getUrl(object):
    """Log in to DXY through a Chrome WebDriver session and dump one forum thread.

    ``run()`` drives the browser through the login form, loads a fixed thread
    page, extracts every ``author:post text`` pair and appends them to
    ``DXY_records.txt``.
    """

    LOGIN_URL = 'https://auth.dxy.cn/accounts/login?service=http://www.dxy.cn/bbs/index.html'
    THREAD_URL = 'http://www.dxy.cn/bbs/thread/626626#626626'
    OUTPUT_FILE = 'DXY_records.txt'

    def __init__(self, username='1******3', password='w******8'):
        """Store the request headers and the login credentials.

        Args:
            username: Account name typed into the ``username`` form field.
            password: Password typed into the ``password`` form field.

        The defaults are the masked placeholders from the original script, so
        ``getUrl()`` keeps working exactly as before.
        """
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1;Win64;x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        self.username = username
        self.password = password

    def run(self):
        """Log in, scrape the thread page and append the records to the output file.

        The browser is always shut down, even when login or scraping raises,
        so no Chrome/driver process is left behind.
        """
        browser = webdriver.Chrome()
        try:
            self._login(browser)
            browser.get(self.THREAD_URL)
            records = self._extract_records(browser.page_source)
        finally:
            browser.quit()

        self._save_records(records)
        print('success')

    def _login(self, browser):
        """Open the login page, switch the visible tab and submit the credentials."""
        browser.get(self.LOGIN_URL)
        time.sleep(1)
        # Hide tab 1 and reveal tab 2 via inline styles; presumably tab 2 is the
        # username/password form (confirm against the live page).
        browser.execute_script('document.querySelector("#j_loginTab1").style.display="none";')
        time.sleep(1)
        browser.execute_script('document.querySelector("#j_loginTab2").style.display="block";')

        # find_element(By..., ...) works on Selenium 3 and 4; the older
        # find_element_by_* helpers were removed in Selenium 4.3.
        input_name = browser.find_element(By.NAME, 'username')
        input_name.clear()
        input_name.send_keys(self.username)

        input_pass = browser.find_element(By.NAME, 'password')
        input_pass.clear()
        input_pass.send_keys(self.password)

        browser.find_element(By.XPATH, '//*[@class="form__button"]/button').click()
        # Fixed pause after submitting; presumably leaves time for the redirect
        # (or a manual verification step) to finish before navigating away.
        time.sleep(10)

    @staticmethod
    def _extract_records(html):
        """Return a list of ``"author:post text"`` strings parsed from *html*."""
        tree = etree.HTML(html)
        users = tree.xpath('//div[@id="postcontainer"]//div[@class="auth"]/a/text()')
        bodies = tree.xpath('//td[@class="postbody"]')
        # zip() pairs authors with posts and stops at the shorter list, so a
        # mismatch in counts can no longer raise IndexError.
        return [
            name.strip() + ":" + body.xpath('string(.)').strip()
            for name, body in zip(users, bodies)
        ]

    def _save_records(self, records):
        """Append each record plus an 80-asterisk separator line to the output file."""
        if not records:
            # Nothing scraped: do not touch the file at all.
            return
        with open(self.OUTPUT_FILE, 'a', encoding="utf-8") as dir_file:
            for result in records:
                dir_file.write(result + "\n")
                dir_file.write('*' * 80 + "\n")


if __name__ == '__main__':
    geturl = getUrl()
    geturl.run()