Installing Python Scrapy on Ubuntu
1. Ubuntu already ships with Python
2. Install Scrapy
First install pip:
apt-get install python-pip
Then install Scrapy:
pip install scrapy
However, the default PyPI index may be unreachable for long stretches, so point pip at one of the mirrors below:
http://pypi.douban.com/simple/
http://pypi.v2ex.com/simple/
sudo pip install -i http://pypi.douban.com/simple/ scrapy
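A quick sanity check (not part of the original steps; the exact version number depends on what pip installed): importing scrapy in the Python shell and printing its version should work without errors.
import scrapy
print(scrapy.version_info)   # e.g. (1, 0, 3) if the install succeeded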
Installing easy_install on Windows 7 64-bit
Run the script below:
http://peak.telecommunity.com/dist/ez_setup.py
This downloads easy_install.exe into C:\Python27\Scripts
Add C:\Python27\Scripts to the PATH environment variable
Install pip:
easy_install pip
This downloads pip.exe into C:\Python27\Scripts
(Although with this Python release, both of these are already installed automatically.)
pip install scrapy
This installs Scrapy into C:\Python27\Lib\site-packages
http://blog.youkuaiyun.com/dreamzml/article/details/8847879
http://jingyan.baidu.com/article/e73e26c0d94e0524adb6a7ff.html
1.
https://www.python.org/downloads/release/python-343/
https://www.python.org/download/releases/2.7.7/
Python 3.4
import sys
sys.path.append('c:/')  # add c:/ to the module search path
import hello  # imports hello.py; hello.py contains a single line: x=1.2
print(hello.x)
>>> dir(hello)
['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'x']
>>> print(hello.__file__)
C:\Python34\hello.py
>>> print(hello.__name__)
hello
When Python prints a whole list or dict, Chinese strings are displayed as their UTF-8 escape sequences, as in the example below:
# -*- coding: utf-8 -*-
list1=['你好abc','那你abc']
print list1
print list1[0]
dic1={'egg':'你好abc','cc':'那你abc'}
print dic1
print dic1['egg']
['\xe4\xbd\xa0\xe5\xa5\xbdabc', '\xe9\x82\xa3\xe4\xbd\xa0abc']
你好abc
{'cc': '\xe9\x82\xa3\xe4\xbd\xa0abc', 'egg': '\xe4\xbd\xa0\xe5\xa5\xbdabc'}
你好abc
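To see the Chinese text itself rather than the escape sequences, one simple option (a minimal Python 2 sketch reusing list1 and dic1 from above) is to print the elements one by one:
# -*- coding: utf-8 -*-
# Printing elements individually shows the readable text instead of the
# \xe4... escapes that appear when the whole list or dict is printed.
list1 = ['你好abc', '那你abc']
for item in list1:
    print item
dic1 = {'egg': '你好abc', 'cc': '那你abc'}
for key, value in dic1.items():
    print key, value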
2.
import urllib
content = urllib.urlopen('http://www.baidu.com').read()
print(content)
import urllib2
content = urllib2.urlopen('http://www.baidu.com').read()
print(content)
import urllib2
req = urllib2.Request('http://www.baidu.com')
content = urllib2.urlopen(req).read()
print content
import urllib2
req = urllib2.Request('http://www.baidu.com/')
req.add_header('User-Agent', 'fake-client')
res = urllib2.urlopen(req)
print res.read()
import urllib
import urllib2
url = 'http://www.baidu.com'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
values = {'name' : 'WHY',
'location' : 'SDU',
'language' : 'Python' }
data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
res = urllib2.urlopen(req)
print res.headers.items()
import urllib2
req = urllib2.Request('http://www.baidu.com')
res = urllib2.urlopen(req)
item = res.headers.items()
print item
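For reference only (not in the original notes): under Python 3.4, which is mentioned above, urllib2 no longer exists, and the equivalent calls go through urllib.request. A minimal sketch:
# Python 3 equivalent of the urllib2 examples above (urllib2 became urllib.request)
import urllib.request

req = urllib.request.Request('http://www.baidu.com', headers={'User-Agent': 'fake-client'})
res = urllib.request.urlopen(req)
print(res.read()[:200])            # first 200 bytes of the response body
print(list(res.headers.items()))   # the response headers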
Regular expressions
# -*- coding: utf-8 -*-
import re
match = re.search(r'(hello) (world)', 'hello world!')
print match.group(0)  # the whole match: 'hello world'
print match.group(1)  # first capture group: 'hello'
print match.group(2)  # second capture group: 'world'
Greedy mode: a.*b. The dot matches any character except a newline, and * means zero or more of the preceding item; since that item is the dot, the repeated characters need not all be the same. Combined, the pattern matches the longest string that starts with a and ends with b.
import re
x='aak7bka7kb'
y=re.findall('a.*b',x)
print y
['aak7bka7kb']
Lazy (non-greedy) mode
a.*?b
Matches the shortest string that starts with a and ends with b.
Adding ? after .* still allows any number of repetitions, but uses as few as possible while still letting the overall match succeed.
__author__ = 'songl'
import re
x='aak7bka7kb'
y=re.findall('a.*?b',x)
print y
['aak7b', 'a7kb']
http://deerchao.net/tutorials/regex/regex.htm
Use the third-party requests library in place of urllib and urllib2
# -*- coding: utf-8 -*-
import requests
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
html = requests.get('http://news.fx678.com/news/top/index.shtml', headers = hea)
html.encoding = 'utf-8'  # decode as utf-8, otherwise the Chinese text shows up garbled
chinese = re.findall(r'class="touzi_font"(.*?)</div>', html.text, re.S)
for each in chinese:
    hhh = re.findall(r'target="_blank">(.*?)</a></h1>', each, re.S)
    print hhh[0]
All news titles from the Huitong (fx678) site
First match out everything inside each <div class="touzi_font"> tag,
then process the individual tags inside that div (a snippet of the page's HTML is reproduced after the code for reference).
# -*- coding: utf-8 -*-
import requests
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

def findweb(url):
    hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
    html = requests.get(url, headers = hea)
    html.encoding = 'utf-8'  # decode as utf-8, otherwise the Chinese text shows up garbled
    chinese = re.findall('class="touzi_font"(.*?)</div>', html.text, re.S)
    for each in chinese:
        hhh = re.findall('target="_blank">(.*?)</a></h1>', each, re.S)
        print hhh[0]

url = 'http://news.fx678.com/news/top/index.shtml'
findweb(url)
for i in range(1, 11):
    new_url = re.sub('index', 'index%d' % i, url)  # note: re.sub's fourth argument is count, not flags, so re.S is dropped here
    findweb(new_url)
<div class="touzi_font">
<h1><a href="/C/20151017/201510171648121936.shtml" οnclick="add_click('201510171648121936')" target="_blank">周评:核心CPI助美元重夺失地,下周关注中国GDP及欧银...</a></h1>
<p><a href="/C/20151017/201510171648121936.shtml" οnclick="add_click('201510171648121936')" target="_blank">美元指数本周下跌0.16%,收报94.73,一度触及七周低位93.80,因因美联储或进一步推迟升息忧虑加重,而本周稍晚公布的美国核心CPI上升帮助美元挽回了大部分跌势。</a></p>
</div>
<div class="clear"></div>
</li>
<li>
<div class="new_6_rt"><span>文/</span><a href="/news/Editor/193/index.shtml">飞鱼</a></div>
<div class="clock_touzi">2015年10月17日 16:16:41 </div>
<div class="clear"></div>
<div class="new_6_pic">
<a href="/C/20151017/201510171616241935.shtml" οnclick="add_click('201510171616241935')" target="_blank"><img src=http://upload.fx678.com/upload/ht/20151017/sl_2015101716135261.jpg width="145px" height="110px" /></a>
</div>
<div class="touzi_font">
<h1><a href="/C/20151017/201510171616241935.shtml" οnclick="add_click('201510171616241935')" target="_blank">周评:推迟升息忧虑发酵,黄金涨近2%触及四个月高位</a></h1>
<p><a href="/C/20151017/201510171616241935.shtml" οnclick="add_click('201510171616241935')" target="_blank">现货黄金本周(10月12日至10月16日)大涨1.81%,收报1176.97美元/盎司,周四(10月15日)一度触及四个月高位1191.48美元/盎司,因美联储或进一步推迟升息的忧虑大大鼓舞了黄金多头。不过本周稍晚公布的美国核心CPI上升令黄金回吐了...</a></p>
</div>
<div class="clear"></div>
</li>
<li>
<div class="new_6_rt"><span>文/</span><a href="/news/Editor/185/index.shtml">龙舞</a></div>
<div class="clock_touzi">2015年10月17日 07:28:14 </div>
<div class="clear"></div>
<div class="new_6_pic">
<a href="/C/20151017/201510170728141851.shtml" οnclick="add_click('201510170728141851')" target="_blank"><img src=http://upload.fx678.com/upload/ht/20151017/sl_2015101707194377.jpg width="145px" height="110px" /></a>
</div>
<div class="touzi_font">
<h1><a href="/C/20151017/201510170728141851.shtml" οnclick="add_click('201510170728141851')" target="_blank">美元油价宁死不屈,下周OPEC再掀腥风血雨</a></h1>
<p><a href="/C/20151017/201510170728141851.shtml" οnclick="add_click('201510170728141851')" target="_blank">10月16日美国经济数据好坏参半,美元仍然在升息预期和风险偏好的助推下小升,在近来疲软数据和中国放缓忧虑的施压下连续两日反弹。原油价格也录得良好表现,空头回补和钻井平台数据助推油价上涨2%。下周原油市场的焦点将转向OP...</a></p>
</div>
Getting the Jikexueyuan course list
# -*- coding: utf-8 -*-
import requests
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
def findweb(url):
    hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
    html = requests.get(url, headers = hea)
    html.encoding = 'utf-8'  # decode as utf-8, otherwise the Chinese text shows up garbled
    chinese = re.findall('(<li id="\d*" test="0" deg="0" >.*?</li>)', html.text, re.S)
    for each in chinese:
        # print each
        learn = re.search('<em class="learn-number">(.*?)</em>', each, re.S).group(1)
        jibie = re.search('<i class="xinhao-icon\d*"></i><em>(.*?)</em>', each, re.S).group(1)
        keshi = re.search('<dd class="mar-b8"><i class="time-icon"></i><em>(.*?)</em>', each, re.S).group(1)
        title = re.search('class="lessonimg" title="(.*?)>', each, re.S).group(1)
        print title
        print learn
        print jibie
        print keshi
        print '\n'

url = 'http://www.jikexueyuan.com/course/?pageNum=1'
findweb(url)
Fetching a UTF-8 encoded site with requests
The IDE is PyCharm.
1. Add the utf-8 coding declaration at the top of the file:
# -*- coding: utf-8 -*-
import requests
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
html = requests.get('http://www.baidu.com',headers = hea)
html.encoding = 'utf-8'
print html.text
2. In the bottom-right corner of the .py editor window, select utf-8.
3. Press Ctrl+Alt+S to open Settings and go to Editor -> File Encodings.
Set the project encoding to UTF-8.
Fetching a GB2312-encoded site with requests
1. Use the response's content (raw bytes) rather than text (an alternative that decodes the text explicitly is sketched after these steps):
import requests
import re
hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
html = requests.get('http://cl.loei.pw/thread0806.php?fid=21',headers = hea)
print html.content
2. Press Ctrl+Alt+S to open Settings and go to Editor -> File Encodings.
Set the project encoding to GBK.
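Alternatively, rather than printing the raw bytes from content, you can tell requests which codec the page uses and then read text as decoded unicode. A minimal sketch; the URL below is only a placeholder for any GB2312 page:
import requests

hea = {'User-Agent': 'Mozilla/5.0'}
html = requests.get('http://example.com/some-gb2312-page', headers=hea)  # placeholder URL
html.encoding = 'gb2312'   # declare the page's codec so .text decodes correctly
print html.text            # decoded text instead of raw bytes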
Using XPath
//            search from the document root (matches nodes at any depth)
/             go down one level
/text()       extract the text content
/@xxx         extract the value of attribute xxx
[@id="xx"]    keep only elements whose id is xx
[@class="xx"] keep only elements whose class is xx
[@yy="xx"]    keep only elements whose attribute yy equals xx
A few examples
The text of every li under a ul under a div under any element:
content = selector.xpath('//*/div/ul/li/text()')
The text of every element:
content = selector.xpath('//*/text()')
The text of every li under the element whose id is "useful":
content = selector.xpath('//*[@id="useful"]/li/text()')
The text of every li element:
content = selector.xpath('//li/text()')
The following three are equivalent (see the short sketch below):
content = selector.xpath('//*/body/text()')
content = selector.xpath('//body/text()')
content = selector.xpath('/html/body/text()')
A path that starts with a single / must begin at the root node; otherwise use two slashes (//).
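A tiny self-contained sketch of that equivalence (the HTML string here is just a toy example):
# -*- coding: utf-8 -*-
from lxml import etree

html = '<html><body>hello<div>child</div>world</body></html>'
selector = etree.HTML(html)
# All three print ['hello', 'world'] -- the direct text of <body>:
print selector.xpath('/html/body/text()')   # absolute path from the root
print selector.xpath('//body/text()')       # // finds body wherever it sits
print selector.xpath('//*/body/text()')     # same, with an explicit wildcard parent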
Sometimes the XPath copied via the 360 browser's right-click "Copy XPath" differs from what should be passed to selector.xpath(); for example, a div's index may differ because the page is generated dynamically.
When nothing is matched, debug by walking the path from the start and checking which node index is off.
content = selector.xpath('/html/body/*')   # all child nodes of the body node
print(content)
content = selector.xpath('/html/body/@*')  # all attributes of the body node
print(content)
content = selector.xpath('/html/body/div[1]/*')   # all child nodes of div[1]
print(content)
content = selector.xpath('/html/body/div[1]/@*')  # all attributes of div[1]
print(content)
content = selector.xpath('/html/body/div[4]/div[1]/div[2]/div[5]/table/tbody/tr[6]/td/div[1]')
print(content)
for each in content:
    innerhtml = etree.tostring(each)  # serialize the div node's entire HTML to a string
    print innerhtml
content = selector.xpath('/html/body/div[4]/div[1]/div[2]/div[5]/table/tbody/tr[6]/td/div[1]/text()')
print(content)
for each in content:
    innerhtml = each  # the div's own text only (child nodes' text is not included)
    print innerhtml
content = selector.xpath('/html/body/div[4]/div[1]/div[2]/div[5]/table/tbody/tr[6]/td/div[1]')
for each in content:
    siteurl = each.xpath('string(.)')  # all text under this div, including every descendant node
    print siteurl
Install lxml:
pip install lxml
# -*- coding: utf-8 -*-
from lxml import etree
html = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title>测试-常规用法</title>
</head>
<body>
<div id="content">
<ul id="useful">
<li>这是第一条信息</li>
<li>这是第二条信息</li>
<li>这是第三条信息</li>
</ul>
<ul id="useless">
<li>不需要的信息1</li>
<li>不需要的信息2</li>
<li>不需要的信息3</li>
</ul>
<div id="url">
<a href="http://jikexueyuan.com">极客学院</a>
<a href="http://jikexueyuan.com/course/" title="极客学院课程库">点我打开课程库</a>
</div>
</div>
</body>
</html>
'''
selector = etree.HTML(html)
# extract text
content = selector.xpath('//ul[@id="useful"]/li/text()')
for each in content:
    print each
# extract attributes
link = selector.xpath('//a/@href')
for each in link:
    print each
title = selector.xpath('//a/@title')
print title[0]
Using XPath to get the news titles on the Huitong (fx678) news front page
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
def findweb(url):
    hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
    html = requests.get(url, headers = hea)
    html.encoding = 'utf-8'  # decode as utf-8, otherwise the Chinese text shows up garbled
    selector = etree.HTML(html.text)
    # extract the titles
    content = selector.xpath('//*[@id="analysis_ul"]/li/div[5]/h1/a/text()')
    for each in content:
        print each

url = 'http://news.fx678.com/news/top/index.shtml'
findweb(url)
Following the "next page" link to get all Huitong news titles
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
def findweb(url):
    hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
    html = requests.get(url, headers = hea)
    html.encoding = 'utf-8'  # decode as utf-8, otherwise the Chinese text shows up garbled
    selector = etree.HTML(html.text)
    # extract the titles
    content = selector.xpath('//*[@id="analysis_ul"]/li/div[5]/h1/a/text()')
    for each in content:
        print each
    nextlink = selector.xpath('//div[@class="hc_content"]/div[@class="analysis_inter_left"]/div[@class="hc_new_6"]/div[@class="my_pg"]/a[@class="nxt"]/@href')
    print nextlink
    if nextlink:
        nextlink = nextlink[0]
        print nextlink
        url = 'http://news.fx678.com/news/top/' + nextlink
        findweb(url)

url = 'http://news.fx678.com/news/top/index.shtml'
findweb(url)
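The recursive call works, but every page adds a stack frame, so a very long archive could eventually hit Python's recursion limit. A loop-based sketch of the same crawl (same selectors as above) avoids that:
# Iterative variant: follow the "next page" link in a loop instead of recursing.
import requests
from lxml import etree
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
url = 'http://news.fx678.com/news/top/index.shtml'
while url:
    html = requests.get(url, headers = hea)
    html.encoding = 'utf-8'
    selector = etree.HTML(html.text)
    for each in selector.xpath('//*[@id="analysis_ul"]/li/div[5]/h1/a/text()'):
        print each
    nextlink = selector.xpath('//div[@class="hc_content"]/div[@class="analysis_inter_left"]/div[@class="hc_new_6"]/div[@class="my_pg"]/a[@class="nxt"]/@href')
    if nextlink:
        url = 'http://news.fx678.com/news/top/' + nextlink[0]
    else:
        url = None   # no "next" link left, stop the loop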
Downloading images
# -*- coding: utf-8 -*-
import re
import requests
import os

pic_url = ['http://pic.meizitu.com/wp-content/uploads/2015a/10/27/01.jpg',
           'http://pic.meizitu.com/wp-content/uploads/2015a/08/11/01.jpg']
if not os.path.isdir('pic'):
    os.mkdir('pic')   # the pic folder must exist before files can be written into it
i = 0
for each in pic_url:
    print 'now downloading:' + each
    pic = requests.get(each)
    fp = open('pic\\' + str(i) + '.jpg', 'wb')
    fp.write(pic.content)
    fp.close()
    i += 1
Python and MySQL
apt-get install python-mysqldb
# -*- coding: utf-8 -*-
from MySQLdb import *

def conn():
    cn = Connection('192.168.1.100', 'root', '', 'meizi1688')
    # Connection() arguments, in order:
    #   host   (string, host to connect to)
    #   user   (string, user to connect as)
    #   passwd (string, password to use)
    #   db     (string, database to use)
    # The database can also be selected afterwards:
    # cn.select_db('test')
    cur = cn.cursor()
    cur.execute('select * from mz_url')
    # move the cursor position (defaults to 0):
    # cur.scroll(0)
    row = cur.fetchall()
    # fetch all remaining rows; the result is a tuple of tuples
    print row
    # insert
    sql = "insert into mz_url(name,url) values(%s,%s)"
    param = ("aaa", 'bb')
    n = cur.execute(sql, param)
    print 'insert', n
    # commit
    cn.commit()
    # close
    cn.close()

if __name__ == '__main__':
    conn()
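A parameterized SELECT works the same way as the parameterized INSERT above; a minimal sketch against the same mz_url table (column names assumed from the insert statement):
# Minimal sketch: MySQLdb substitutes the parameters itself, handling quoting/escaping.
from MySQLdb import Connection

cn = Connection('192.168.1.100', 'root', '', 'meizi1688')
cur = cn.cursor()
cur.execute('select name, url from mz_url where name = %s', ('aaa',))
row = cur.fetchone()   # one row as a tuple, or None if nothing matched
print row
cn.close()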