#coding=utf-8
import urllib2
import httplib
import re
from pybloomfilter import BloomFilter
import StringIO
import os
import gzip
import zlib
import lxml
from lxml import html
from lxml import etree
from bs4 import BeautifulSoup
# Request headers impersonating Chrome on an Android (Nexus 5) phone so
# 3g.163.com serves its regular mobile pages; Referer points back at the
# news-list URL we are about to fetch.
request_headers = {
    'Accept': "image/webp,image/apng,image/*,*/*;q=0.8",
    'Accept-Encoding': "gzip, deflate",
    'Accept-Language': "zh-CN,zh;q=0.8",
    'Connection': "keep-alive",
    'Referer': "http://3g.163.com/touch/local?dataversion=A&uversion=A&version=v_standard",
    'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Mobile Safari/537.36",
}
# Bloom filter for de-duplicating downloaded URLs:
# 16M-entry capacity with a 1% false-positive rate.
download_bf = BloomFilter(16 * 1024 * 1024, 0.01)
url = 'http://3g.163.com/touch/local?dataversion=A&uversion=A&version=v_standard'
req = urllib2.Request(url, headers=request_headers)
response = urllib2.urlopen(req)
htmlcontent = response.read()
#如果是gzip解码的话,怕出现乱码,要用下面三行
gzipped = response.headers.get('Content-Encoding')
if gzipped:
htmlcontent = zlib.decompress(htmlcontent, 16+zlib.MAX_WBITS)
print htmlcontent
soup = BeautifulSoup(htmlcontent, 'lxml')
urls=[]
news_content=[]
# a=soup.select('div.cm_news_main > ul.cm_ul_round > li > a ')
# print a
# ul_contents=soup.select('ul[class="cm_ul_round ul_page1"] > li > a')
# print ul_contents
# for link in soup.select('div.aslide > a'):
#
# urls.append(link.get('href'))
# news_content.append(link.text)
# print urls
#
#
# for i in news_content:
# print i
# print len(news_content)
for link in soup.select('div.ndi_main > h3 > a'):
urls.append(link.get('href'))
news_content.append(link.text)
print urls
print len(news_content)
# Template for scraping static (non-dynamic) web pages.