爬虫_for_python
requests安装
pip install requests
requests使用
requests.get() requests.post() requests.head() requests.delete()
例子
# Example: fetch a page with requests and print the response body.
# NOTE(review): performs a live network request; requires connectivity.
import requests

url = 'http://www.baidu.com'
html = requests.get(url)
print(html.text)  # print() call: required syntax on Python 3
python正则模块使用
findall(pattern,str,re.S) search(pattern,str,re.S) sub(pattern,replace,str)
例子
# Example: extract link targets with the re module.
# Assumes `html` is a requests Response and `pages` is a URL string
# containing a "pn=<digit>" query parameter — both defined by the caller.
import re

# re.S makes '.' match newlines so patterns span lines of HTML.
urls = re.findall('<a href=(.*?)>', html.text, re.S)
for each in urls:
    print(each)

# search() returns only the first Match object (or None).
print(re.search('<a href=(.*?)>', html.text, re.S))

# Rewrite the page-number parameter to generate URLs for pages 0-9.
for i in range(10):
    print(re.sub(r'pn=\d', 'pn=%d' % i, pages))  # raw string for the regex
xpath语法
//根节点 /下一层路径 [@xx=xx]特定的标签 /text()以文本返回 /@para返回参数 string(.)当前层的所有内容作为一个字符串输出 starts-with(str)所有以这个str开头的标签
例子
# Example: extract text with lxml + XPath.
# Assumes `html` is a requests Response defined by the caller.
from lxml import etree  # fixed typo: was "form lxml import etree"

selector = etree.HTML(html.text)
# XPath 1.0 function is starts-with(); the method on lxml elements is
# lowercase .xpath() — both were misspelled in the original.
content = selector.xpath('//div[starts-with(@id,"test")]/text()')
for each in content:
    print(each)

selector = etree.HTML(html.text)
tmp = selector.xpath('//div[@id="class"]')[0]
# string(.) flattens all descendant text of the node into one string.
info = tmp.xpath('string(.)')
content2 = info.replace('\n', '')
print(content2)