#coding=utf-8
a = 10
b = 20
print a < b or a == b
print a < b and a == b
print not True
##################################################################################################
print str.__doc__
##################################################################################################
a = "tom"
b = "jerry"
print a+b
##################################################################################################
a = "xYz"
print a.islower()
##################################################################################################
path = "D:\\testpython.txt"
f = open(path, "w")
f.write("First line 1.\n")
f.write("First line 2.")
f.close()
f = open(path, "r")
for line in f:
    print line
f.close()
##################################################################################################
# Exception handling
s = raw_input("Input your age:")
if s == "":
    raise Exception("Input must not be empty.")
try:
    i = int(s)
except ValueError:
    print "Could not convert data to an integer"
except:
    print "Unknown exception"
else:  # runs only when the try clause raises no exception
    print "You are %d years old" % i
finally:  # clean-up action, runs no matter what
    print "Goodbye!"
##################################################################################################
# Classes and inheritance
class Base:
    def __init__(self):
        self.data = []

    def add(self, x):
        self.data.append(x)

    def addtwice(self, x):
        self.add(x)
        self.add(x)

class Child(Base):
    def plus(self, a, b):
        return a + b

ochild = Child()
ochild.add("str1")
print ochild.data
print ochild.plus(2, 3)
##################################################################################################
import os
import os.path
# os and os.path contain most of the file-access functions, so import both.
rootdir = "d:\\SiLabs"
for parent, dirnames, filenames in os.walk(rootdir):
    # case 1: iterate over all directories
    for dirname in dirnames:
        print (" parent is: " + parent)
        print (" dirname is: " + dirname)
    # case 2: iterate over all files
    for filename in filenames:
        print (" parent is: " + parent)
        print (" filename is: " + filename)
''' Key points:
* os.walk yields a 3-tuple per directory: parent is the directory being walked,
  dirnames holds the folder names under it (without paths), and filenames holds
  the file names (without paths).
* case 1 shows how to iterate over every directory.
* case 2 shows how to iterate over every file.
* os.path.join(dirname, filename): turns "/a/b/c" and "d.java" into
  "/a/b/c/d.java" (see the sketch below).
'''
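# A minimal sketch of the os.path.join point above (same rootdir walk as the
# cases, purely illustrative): join each parent with a file name to get the
# full path of every file under rootdir.
for parent, dirnames, filenames in os.walk(rootdir):
    for filename in filenames:
        print os.path.join(parent, filename)  # e.g. d:\SiLabs\sub\file.txt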
##################################################################################################
import os.path
# Three common operations: split a path into directory and file name,
# split off the drive letter (Windows), and split off the file extension.
spath = "D:\\Desktop\\Cannon\\stm401.rar"
# case 1:
p, f = os.path.split(spath)
print ( " dir is: " + p)
print ( " file is: " + f)
# case 2:
drv, left = os.path.splitdrive(spath)
print ( " driver is: " + drv)
print ( " left is: " + left)
# case 3:
f, ext = os.path.splitext(spath)
print ( " f is: " + f)
print ( " ext is: " + ext)
'''
Key points: all three functions return a 2-tuple.
* case 1 splits the directory from the file name
* case 2 splits the drive letter from the rest of the path
* case 3 splits the file name from its extension
'''
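# A quick round-trip check (illustrative, assuming the Windows-style spath
# above): since os.path.split returns (dir, file), os.path.join should
# reassemble the original path.
p, f = os.path.split(spath)
print os.path.join(p, f) == spath  # expected: True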
##################################################################################################
import os.path
import shutil
src = "D:/test/myfile1.txt"
dst = "D:/test/myfile2.txt"
dst2 = "D:\\test\\myfile3.txt"
dst3 = "D:/test/测试文件夹.txt"
uipath = unicode(dst3, "utf8")
dir1 = os.path.dirname(src)
print ("dir1 %s" % dir1)
# create the parent directory if it does not exist yet
if not os.path.exists(dir1):
    os.makedirs(dir1)
f1 = open(src, "w")
f1.write("line a\n")
f1.write("line b\n")
f1.close()
shutil.copyfile(src, dst)
shutil.copyfile(src, dst2)
shutil.copyfile(src, uipath)
f2 = open(dst, 'r')
for line in f2:
    print (line)
f2.close()
# test copying a directory tree
try:
    srcDir = "D:/test"
    dstDir = "D:/test2"
    # shutil.copytree raises an error if dstDir already exists,
    # which also means you cannot use d: directly as the target path.
    shutil.copytree(srcDir, dstDir)
except Exception as err:
    print (err)
'''
Key points:
* shutil.copyfile: how to copy a file
* os.path.exists: how to check whether a path exists
* shutil.copytree: how to copy a directory tree (workaround sketch below)
'''
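# The try block above notes that shutil.copytree fails when dstDir already
# exists. A common workaround (a sketch, not part of the original code; it
# assumes the old copy can be discarded) is to remove the stale tree first:
if os.path.exists(dstDir):
    shutil.rmtree(dstDir)        # drop the old copy so copytree can recreate it
shutil.copytree(srcDir, dstDir)  # now succeeds even on a re-run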
##################################################################################################
import os.path
import shutil
import datetime
def BackUpDD():
    # add dirs you want to copy
    backdir = "D:\\temp"
    copydirs = []
    copydirs.append("D:\\test")
    print (" Copying files =================== ")
    # generate a dated folder for this backup
    start = datetime.datetime.now()
    backdir = os.path.join(backdir, start.strftime("%Y%m%d"))
    print copydirs[0]
    print backdir
    try:
        shutil.copytree(copydirs[0], backdir)
    except Exception as err:
        print (err)
    end = datetime.datetime.now()
    print ("Finished! ===================")
    print ("Elapsed time : " + str((end - start).seconds) + " seconds")
def omitPrefix(fullpath, prefix):
    # Strip a leading prefix from a path.
    # Given /media/data/programmer/project/python/tutorial/file/test.py
    # with prefix /media/data/programmer/project,
    # return the path as python/tutorial/file/test.py
    print fullpath[len(prefix) + 1:]
    return fullpath[len(prefix) + 1:]
#omitPrefix('D:\\temp\\123.txt', 'D:\\temp')
BackUpDD()
# Note: copytree copies file metadata as well
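# On the metadata remark above: shutil.copytree copies each file with
# shutil.copy2, which preserves metadata such as modification times, while
# shutil.copyfile copies contents only. A minimal sketch (the destination
# file name is made up for illustration):
shutil.copy2("D:\\test\\myfile1.txt", "D:\\test\\myfile1_meta.txt")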
##################################################################################################
import os
copydirs = []
copydirs.append("D:\\test")
# is this path a regular file?
print os.path.isfile(copydirs[0])
# is this an absolute path?
print os.path.isabs(copydirs[0])
print os.listdir(copydirs[0])
# run a shell command
#os.system('ping www.pythontab.com')
print os.system('msconfig')  # or e.g. 'ipconfig'
##################################################################################################
import urllib2
req = urllib2.Request('http://bbs.youkuaiyun.com/callmewhy')
# HTTPError is a subclass of URLError, so the HTTPError clause must come first
try:
    urllib2.urlopen(req)
except urllib2.HTTPError as e:
    print 'The server couldn\'t fulfill the request.'
    print 'Error code: ', e.code
except urllib2.URLError as e:
    print 'We failed to reach a server.'
    print 'Reason: ', e.reason
##################################################################################################
# request a page via GET and via POST
import urllib
import urllib2
url= 'http://www.zhihu.com'
values = {}
values['username'] = "2826098981@qq.com"
values['password'] = "XXXXX"
data = urllib.urlencode(values)
# POST: pass the encoded data as the request body
request = urllib2.Request(url, data)
# GET: append the encoded data to the URL
geturl = url + "?" + data
request = urllib2.Request(geturl)
response = urllib2.urlopen(request)
print 'url:' + url
print response.info()
##################################################################################################
# Anti-hotlinking: some servers check the Referer header, so supply one
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
           'Referer': 'http://www.zhihu.com/articles'}
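# A minimal sketch of actually sending these headers (the target URL is
# assumed for illustration): pass the dict to urllib2.Request so the server
# sees the spoofed User-Agent and Referer.
import urllib2
req = urllib2.Request('http://www.zhihu.com', headers=headers)
response = urllib2.urlopen(req)
print response.getcode()  # HTTP status of the reply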
##################################################################################################
# Crawl pages of a Baidu Tieba thread
import urllib2
# download each page of the thread and save it to a numbered .html file
def baidu_tieba(url, begin_page, end_page):
    for i in range(begin_page, end_page + 1):
        sName = str(i).zfill(5) + '.html'  # pad the file name to five digits
        print 'Downloading page ' + str(i) + ' and saving it as ' + sName + '......'
        f = open(sName, 'w+')
        m = urllib2.urlopen(url + str(i)).read()
        f.write(m)
        f.close()
# -------- set the parameters here ------------------
# address of a thread in the Shandong University Tieba:
# bdurl = 'http://tieba.baidu.com/p/2296017831?pn='
# iPostBegin = 1
# iPostEnd = 10
#bdurl = str(raw_input('Enter the thread address, without the number after pn=:\n'))
#begin_page = int(raw_input('Enter the first page number:\n'))
#end_page = int(raw_input('Enter the last page number:\n'))
bdurl = 'https://tieba.baidu.com/p/297554321?pn='
begin_page = 1
end_page = 5
# -------- end of parameters ------------------
baidu_tieba(bdurl, begin_page, end_page)
##################################################################################################
# You can turn on the debug log as follows; the contents of sent and received
# packets are then printed to the screen, which is handy for debugging.
# This is not used very often.
import urllib2
httpHandler = urllib2.HTTPHandler(debuglevel=1)
httpsHandler = urllib2.HTTPSHandler(debuglevel=1)
opener = urllib2.build_opener(httpHandler, httpsHandler)
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.baidu.com')
##################################################################################################
# scrape jokes from qiushibaike.com
import urllib
import urllib2
import re
import thread
import time
class spiderModel:

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    def GetPage(self, page):
        myUrl = "http://m.qiushibaike.com/hot/page/" + page
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(myUrl, headers=headers)
        myResponse = urllib2.urlopen(req)
        myPage = myResponse.read()
        # encode converts a unicode object into a byte string in some encoding;
        # decode converts a byte string in some encoding back into unicode
        # (see the small demo after the entry point below)
        unicodePage = myPage.decode("utf-8")
        # find every div tag with class="content"
        # re.S puts . into any-match mode, so it also matches newlines
        #pattern = re.compile(r'(?<=<div)*?(?<=>).*?(?=</div>)')
        #myItems = re.search(pattern, unicodePage)
        myItems = re.findall('<div.*?class="content(.*?)>(.*?)</div>', unicodePage, re.S)
        #<span>(.*?)</span>
        #<div.*?class"content">(.*?)</=div>,
        #'<div.*?class="content".*?title="(.*?)">(.*?)</div>' grabs the share section instead
        items = []
        for item in myItems:
            # item[0] is the div's title, i.e. the timestamp
            # item[1] is the div's content, i.e. the joke text
            items.append([item[0].replace("\n", ""), item[1].replace("\n", "")])
        return items
    def loadPage(self):
        # keep running until the user types quit
        while self.enable:
            # keep at most two pages buffered ahead
            if len(self.pages) < 2:
                try:
                    # fetch the jokes on the next page
                    myPage = self.GetPage(str(self.page))
                    self.page += 1
                    self.pages.append(myPage)
                except:
                    print 'Could not connect to qiushibaike!'
            else:
                time.sleep(1)
    def ShowPage(self, nowPage, page):
        you = '|||'
        are = '|||'
        for items in nowPage:
            if items[0] != '"':
                continue
            print 'Page %d' % page, you, items[0], are, items[1]
            myInput = raw_input()
            if myInput == "quit":
                self.enable = False
                break
    def Start(self):
        self.enable = True
        page = self.page
        print 'Loading, please wait......'
        # start a background thread that fetches and buffers pages
        thread.start_new_thread(self.loadPage, ())
        # ----------- show the buffered pages -----------
        while self.enable:
            # if the page buffer holds any pages, show the oldest one
            if self.pages:
                nowPage = self.pages[0]
                del self.pages[0]
                self.ShowPage(nowPage, page)
                page += 1
# ----------- program entry point -----------
print """
---------------------------------------
Program:  Qiushibaike spider
Version:  0.4
Author:   charles
Date:     2017-03-17
Language: Python 2.7
Usage:    type quit to stop reading
Function: press Enter to browse today's hot posts one by one
---------------------------------------
"""
print 'Press Enter to start browsing today\'s posts:'
raw_input(' ')
myModel = spiderModel()
myModel.Start()
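# A tiny demo of the decode/encode note inside GetPage (the byte values are
# illustrative): decode turns a byte string into unicode, encode turns it back.
s_bytes = '\xe4\xbd\xa0\xe5\xa5\xbd'        # UTF-8 bytes for a Chinese greeting
s_unicode = s_bytes.decode('utf-8')         # byte string -> unicode object
print s_unicode.encode('utf-8') == s_bytes  # unicode object -> byte string: True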
##################################################################################################
import re
import urllib
def getHtml(url):
    page = urllib.urlopen(url)
    html = page.read()
    return html

def getImg(html):
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    x = 0
    for imgurl in imglist:
        # urllib.urlretrieve downloads the remote file straight to disk
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1
    return imglist

html = getHtml("http://tieba.com/p/2460150866")#4877448324
print getImg(html)
#reg = r'src="(.+?\.jpg)" pic_ext'
#src="       matches the literal src="
#(.+?\.jpg)
#  the parentheses form a group that captures the matched text
#  .+ matches one or more arbitrary characters; the ? makes it lazy,
#  i.e. it matches as few characters as possible
#  together, .+?\.jpg matches only up to the nearest .jpg, so the match
#  cannot run past the end of the src attribute
#  this group therefore captures the image URLs in the page
#" pic_ext   matches the literal " pic_ext
#print re.escape('<div><class="content"><span>A</span></div>')
#\<div\>\<class\=\"content\"\>\<span\>(.*?)(.*?)\<\/span\>\<\/div\>
# Download images from a Baidu Tieba thread
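# A tiny self-contained demo of lazy vs. greedy matching (the sample string
# is made up for illustration):
sample = 'src="a.jpg" pic_ext src="b.jpg" pic_ext'
print re.findall(r'src="(.+?\.jpg)" pic_ext', sample)  # lazy: ['a.jpg', 'b.jpg']
print re.findall(r'src="(.+\.jpg)" pic_ext', sample)   # greedy: one over-long match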
##################################################################################################
import re
import urllib
def getHtml(url):
    page = urllib.urlopen(url)
    html = page.read()
    return html

def getStat(html):
    reg = r'class="wea">(.*?)</p>'
    imgre = re.compile(reg)
    weatherlist = re.findall(imgre, html)
    return weatherlist

html = getHtml("http://www.weather.com.cn/weather/101190101.shtml")#4877448324
weatherlist = getStat(html)
for weather in weatherlist:
    print weather.decode('utf-8')
# First cut: query a week of Nanjing weather from weather.com.cn
##################################################################################################
import re
import urllib
def getHtml(url):
    page = urllib.urlopen(url)
    html = page.read()
    return html

def getStat(html):
    reg = r'<span>(.*?)</span>/<i>(.*?)</i>'
    #reg = r'class="wea">(.*?)</p>'
    imgre = re.compile(reg)
    weatherlist = re.findall(imgre, html)
    return weatherlist

html = getHtml("http://www.weather.com.cn/weather/101190101.shtml")#4877448324
weatherlist = getStat(html)
for weather in weatherlist:
    print 'Today\'s high:'
    print weather[0].decode('utf-8')
    print 'Today\'s low:'
    print weather[1].decode('utf-8')
# Query a week of Nanjing temperatures from weather.com.cn
##################################################################################################