#!/usr/bin/python
#coding:gbk
import re,urllib
import sys
from subprocess import *
import os
import time
import cPickle as pickle
import string
import random
import datetime
import urllib
from glob import glob
from music_db import *
import PIL.Image as Image
import htmlentitydefs
# Normalize a Chinese-style date string: drop all whitespace, turn the
# 年 (year) / 月 (month) markers into dashes and the 日 (day) marker
# into a trailing space, e.g. "2020年5月3日" -> "2020-5-3 ".
def format_date(dt):
    for pattern, repl in ((r'\s', ''), (r'年|月', '-'), (r'日', ' ')):
        dt = re.sub(pattern, repl, dt)
    return dt
# Clean a news headline: strip "(图)" / "(组图)" photo markers, the bare
# 图/组图 characters and colons, then run it through strip_code() for
# entity/encoding normalisation. Falsy input is returned as-is.
def title_filter(title):
    if title:
        # BUG FIX: the old code first ran re.sub("(图)|(组图)", "", title);
        # those parentheses are regex groups, so it stripped the bare
        # characters and left dangling "()" in the title that the literal
        # patterns below could never match. Remove the literal markers first.
        title = re.sub("\(图\)|\(组图\)","",title)
        title = re.sub("图|组图","",title)
        title = re.sub(":","",title)
        title = strip_code(title)
    return title
# Extract the first image URL from an HTML fragment.
# Takes the text between the src/SRC attribute name and the next space in
# the first <img ...> tag, then strips '=' and surrounding quote characters.
# Returns None when no image tag matches.
def img_filter(info):
    if not info:
        return None
    matches = re.findall(r'<(img|IMG)(.*?)(src|SRC)(.*?) (.*?)>', info)
    if not (matches and isinstance(matches, list)):
        return None
    raw = matches[0][3]
    for junk in ('=', "'", '"'):
        raw = raw.replace(junk, '')
    return raw
# Fetch a URL with curl (10s timeout, 3 retries, 5s between retries) and
# return the body with CR/LF stripped; empty output on failure.
def file_contents(url):
    # SECURITY FIX: build an argument vector instead of a shell string so
    # metacharacters in the (untrusted, crawled) URL cannot be interpreted
    # by the shell. Same curl invocation as before.
    cmd = ['curl', '-s', url, '--max-time', '10', '--retry', '3', '--retry-delay', '5']
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    data = p.communicate()[0]
    if data:
        data = re.sub(r"\n|\r", "", data)
    return data
# Fetch a URL with urllib and return the body with CR/LF removed.
def file_contents2(url):
    handle = urllib.urlopen(url)
    data = handle.read()
    handle.close()
    if data:
        data = re.sub(r'\n|\r', '', data)
    return data
# Download `url` to local path `file` via wget (3 tries, continue partial
# downloads, 15s timeout); return wget's exit status (0 on success).
def file_download(url, file):
    # SECURITY FIX: argument vector instead of a shell string — the old
    # version was injectable through url/file, and the unquoted %s for the
    # output path also broke on paths containing spaces.
    cmd = ['wget', '-t', '3', '-c', '-T', '15', url, '-O', file]
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    p.wait()
    s = p.poll()
    return s
# Extract the first image URL from an HTML fragment.
# NOTE: this is a duplicate definition that shadows the earlier img_filter.
# Takes the text between the src/SRC attribute name and the next space in
# the first <img ...> tag, then strips '=' and surrounding quote characters.
# Returns None when no image tag matches.
def img_filter(info):
    if not info:
        return None
    matches = re.findall(r'<(img|IMG)(.*?)(src|SRC)(.*?) (.*?)>', info)
    if not (matches and isinstance(matches, list)):
        return None
    raw = matches[0][3]
    for junk in ('=', "'", '"'):
        raw = raw.replace(junk, '')
    return raw
# Strip special characters: decode a gb18030 byte string, replace HTML
# entities / numeric character references with their characters (with
# Chinese-typography overrides for quote-like entities), collapse repeated
# whitespace, and return the result re-encoded as UTF-8.
# Returns "" when the input cannot be decoded; falsy input returns None.
def strip_code(s):
    if s:
        # Normalise a space-like character to a plain ASCII space first.
        # NOTE(review): the pattern and replacement render identically here —
        # the original byte was presumably a non-breaking space; confirm.
        s = re.sub(r' ',' ',s)
        try:
            s = s.decode("gb18030")
        except Exception,e:
            # Undecodable input is discarded entirely.
            return ""
        html_pattern = []
        html_str=htmlentitydefs.name2codepoint
        # Quote characters mapped to CJK-style quotation marks.
        # NOTE(review): patterns 1 and 3 look identical (r"'") — likely two
        # different smart-quote characters mangled in transit; confirm.
        html_pattern.append({re.compile(r"'"):u"‘"})
        html_pattern.append({re.compile(r'"'):u'”'})
        html_pattern.append({re.compile(r"'"):u'’'})
        # Numeric character references: &#NNN; (decimal) or &#xHH; (hex).
        html_pattern.append({re.compile('&#(x)?([0-9a-fA-F]+);'):lambda result:unichr(int(result.group(2),result.group(1)=='x' and 16 or 10))})
        # Named entities; the dp table overrides a handful with CJK glyphs.
        dp={"amp":u"&","quot":u"”","ldquo":u"“","rdquo":u"”","lt":u"<","gt":u">","lsquo":u"‘","rsquo":u"’"}
        for k,v in html_str.items():
            # Python 2 idiom: use the override if present, else the codepoint.
            vu = dp.has_key(k) and dp[k] or unichr(v)
            html_pattern.append({re.compile(r"&%s;" %(k)):vu})
        # Collapse runs of tabs / CRs / LFs / spaces to a single one,
        # and trim a leading or trailing newline.
        html_pattern.append({re.compile(r'(\t){2,}'):r'\t'})
        html_pattern.append({re.compile(r'(\r){2,}'):r'\r'})
        html_pattern.append({re.compile(r'(\n){2,}'):r'\n'})
        html_pattern.append({re.compile(r'( ){2,}'):u' '})
        html_pattern.append({re.compile(r'^(\n)|(\n)$'):''})
        # Apply every pattern in insertion order.
        for i in html_pattern:
            for k,v in i.items():
                s = k.sub(v,s)
        return s.encode("utf8")
# Convert an HTML fragment to plain text: drop <script>/<style> blocks,
# collapse literal whitespace, map <br> and </p> to newlines, remove all
# remaining tags plus the character '图', then normalise entities and
# encoding via strip_code().
def strip_tags(strs):
    if strs:
        rules = [
            (r'<script(.*?)</script>', ''),
            (r'<SCRIPT(.*?)</SCRIPT>', ''),
            (r'<style(.*?)<\/style>', ''),
            (r'\n|\r|\t', r'\n'),
            (r'<(br|BR)( ){0,}>', r'\n'),
            (r'<\/(p|P)>', r'\n'),
            (r'<.*?>', ''),
            (r'图', ''),
        ]
        for pattern, repl in rules:
            strs = re.sub(pattern, repl, strs)
        strs = strip_code(strs)
    return strs
# Pre-create a two-level grid of ten thousand directories ("00/00" through
# "99/99") under `path`, so files can later be sharded by two-digit
# prefixes. Existing directories are left untouched; falsy path is a no-op.
def mdir_yiwan(path):
    if not path:
        return
    for outer in range(100):
        first = "%s%02d/" % (path, outer)
        if not os.path.exists(first):
            os.makedirs(first)
        for inner in range(100):
            second = "%s%02d/" % (first, inner)
            if not os.path.exists(second):
                os.makedirs(second)
# Return a unique filename built from `path` plus a 6-character suffix of
# distinct alphanumeric characters; retries until the name does not exist.
def new_file_name(path):
    # PORTABILITY FIX: string.letters is Python-2-only and locale-dependent;
    # string.ascii_letters is the locale-independent equivalent and exists
    # on both Python 2 and 3.
    alphabet = "%s%s" % (string.digits, string.ascii_letters)
    while True:
        # random.sample picks 6 *distinct* characters, as before.
        suffix = "".join(random.sample(alphabet, 6))
        fname = "%s%s" % (path, suffix)
        if not os.path.exists(fname):
            return fname
# Write `info` (typically a dict) to `path` as a pickle, overwriting any
# existing file.
def write_log(path, info):
    # FIX: open in binary mode — a pickle is a byte stream; text mode
    # corrupts protocol >= 1 pickles on Windows and fails on Python 3.
    # try/finally guarantees the handle is closed even if dump() raises
    # (the old code also shadowed the builtin name `file`).
    log_file = open(path, "wb")
    try:
        pickle.dump(info, log_file)
    finally:
        log_file.close()
# Insert one news item (title, date, intro, optional image, type id, url)
# into the music_news_* tables, deduplicating by title prefix and linking
# the item to a matching album or artist when one is found in the title.
# WARNING(review): every SQL statement here is built with %-interpolation
# of scraped text — SQL injection risk; should use parameterised queries.
def in_data(title,dates,intro,img=None,typeid=0,url=None):
    title = title_filter(title)
    intro = strip_tags(intro)
    if title and dates and intro:
        # Clamp typeid to 0..3 (Python 2 "ternary" idiom).
        typeid = typeid > 3 and 3 or typeid
        type_arr = ['qt','rh','om','hy']
        newid = 0
        region = type_arr[typeid]
        print url
        print title
        # Dedup: any existing row whose title starts with this title.
        sql = '''select * from music_news_info where title like "%s%%" limit 1''' %(title)
        mdb = music_db()
        row = mdb.fetchone(sql)
        if not row:
            # Download + resize the remote image; returns local relative path.
            img = img_resize(img)
            today = datetime.datetime.now().strftime("%Y-%m-%d")
            # Append title/url to a per-day crawl log.
            logs = "%s\t%s\n" %(title,url)
            logfile = open('/www/scripts/music/zsc/news/log/%s' %(today),'a')
            logfile.write(logs)
            logfile.close()
            if img:
                # Tag the title with a "(photo)" marker, re-encoded to utf-8.
                title += "(图)".decode("gb18030").encode("utf8")
            print title
            objid = 0
            objtype = "none"
            sql = "select * from music_album_info order by albumname desc "# match against album names
            res = mdb.fetchall(sql)
            if len(res)>0:
                for v in res:
                    try:
                        # Link to the first album whose name occurs in the
                        # title. NOTE(review): album name is used as a raw
                        # regex — special characters raise, hence the bare
                        # except below.
                        if len(re.findall(r"%s" %(v['albumname']),title))>0:
                            objid = v['albumid']
                            objtype = "album"
                            break
                    except Exception,e:
                        pass
            if objid == 0:
                sql = "select * from music_artist_info order by artistname desc"# match against artist names
                res = mdb.fetchall(sql)
                if len(res)>0:
                    for v in res:
                        try:
                            # 1022953 is excluded — presumably a placeholder
                            # artist row; TODO confirm.
                            if v['artistid'] != 1022953 and len(re.findall(r"%s" %(v['artistname']),title))>0:
                                objid = v['artistid']
                                objtype = "artist"
                                break
                        except Exception,e:
                            pass
            sql = '''insert into music_news_info set title="%s", region="%s",objtype="%s",objid="%s",updatetime="%s"''' %(title,region,objtype,objid,dates)
            # query(sql, 1) returns the new auto-increment id.
            newid = mdb.query(sql,1)
            print newid
            if newid>0:
                newid = str(newid)
                # Shard the content table by the id's last digit.
                index = newid[-1]
                newtable = "music_news_content_%s" %(index)
                sql = '''create table if not exists %s like %s; ''' %(newtable,"music_news_content")
                mdb.query(sql)
                sql = '''insert into %s set newsid="%s",content="%s" ''' %(newtable,newid,intro)
                mdb.query(sql)
                if img:
                    sql = '''insert into music_news_images set newsid="%s",path="%s",serverpos="%s" ''' %(newid,img,"192.168.0.29")
                    mdb.query(sql)
# Thumbnail helper.
# infile: remote image URL. Downloads it into a per-day directory under
# dtpath with a timestamp+random name, resizes it in place so the longer
# side is at most 240px (aspect ratio preserved), and returns the path
# relative to dtpath. Returns None on any download/write failure.
def img_resize(infile):
    if infile:
        try:
            data = urllib.urlopen(infile).read()
        except Exception,e:
            return
        # File extension = everything after the last dot in the URL.
        ext = re.sub(r".*\.",r"",infile)
        # Timestamp split into integer part and first 4 fractional digits.
        t = repr(time.time())
        t = re.split(r'\.',t)
        # 4 distinct random digits.
        rand = "".join(random.sample(string.digits,4))
        today = datetime.datetime.now().strftime("%Y-%m-%d")
        dtpath = "/www/scripts/music/zsc/news/img/"
        # Per-day storage directory; created on first use.
        ndir = "%s" %(today)
        if os.path.isdir(dtpath+ndir) is False:
            os.mkdir(dtpath+ndir)
        # Relative file name: <day>/<secs>_<frac4>_<rand>.<ext>
        nfile2 = "%s/%s_%s_%s.%s" %(ndir,t[0],t[1][:4],rand,ext)
        nfile = dtpath+nfile2
        try:
            # Write the downloaded bytes (Python 2 file() builtin).
            f = file(nfile,"wb")
            f.write(data)
            f.close()
        except Exception,e:
            return
        inf = "%s" %(nfile)
        # Open the saved original with PIL.
        img = Image.open(inf)
        if img:
            size = img.size
            # Maximum length of the longer side, in pixels.
            m = 240
            w = size[0]
            h = size[1]
            # Scale so the longer dimension becomes m (Python 2 integer
            # division truncates the other dimension).
            if w>h and w>m:
                h = h*m/w
                w = m
            elif h>w and h>m:
                w = w*m/h
                h = m
            elif h==w and w>m:
                w = m
                h = m
            # Resize and overwrite the original file.
            nimg = img.resize((w,h))
            nimg.save(nfile)
            return nfile2
#coding:gbk
import re,urllib
import sys
from subprocess import *
import os
import time
import cPickle as pickle
import string
import random
import datetime
import urllib
from glob import glob
from music_db import *
import PIL.Image as Image
import htmlentitydefs
# Normalize a Chinese-style date string: drop all whitespace, turn the
# 年 (year) / 月 (month) markers into dashes and the 日 (day) marker
# into a trailing space, e.g. "2020年5月3日" -> "2020-5-3 ".
def format_date(dt):
    for pattern, repl in ((r'\s', ''), (r'年|月', '-'), (r'日', ' ')):
        dt = re.sub(pattern, repl, dt)
    return dt
# Clean a news headline: strip "(图)" / "(组图)" photo markers, the bare
# 图/组图 characters and colons, then run it through strip_code() for
# entity/encoding normalisation. Falsy input is returned as-is.
def title_filter(title):
    if title:
        # BUG FIX: the old code first ran re.sub("(图)|(组图)", "", title);
        # those parentheses are regex groups, so it stripped the bare
        # characters and left dangling "()" in the title that the literal
        # patterns below could never match. Remove the literal markers first.
        title = re.sub("\(图\)|\(组图\)","",title)
        title = re.sub("图|组图","",title)
        title = re.sub(":","",title)
        title = strip_code(title)
    return title
# Extract the first image URL from an HTML fragment.
# Takes the text between the src/SRC attribute name and the next space in
# the first <img ...> tag, then strips '=' and surrounding quote characters.
# Returns None when no image tag matches.
def img_filter(info):
    if not info:
        return None
    matches = re.findall(r'<(img|IMG)(.*?)(src|SRC)(.*?) (.*?)>', info)
    if not (matches and isinstance(matches, list)):
        return None
    raw = matches[0][3]
    for junk in ('=', "'", '"'):
        raw = raw.replace(junk, '')
    return raw
# Fetch a URL with curl (10s timeout, 3 retries, 5s between retries) and
# return the body with CR/LF stripped; empty output on failure.
def file_contents(url):
    # SECURITY FIX: build an argument vector instead of a shell string so
    # metacharacters in the (untrusted, crawled) URL cannot be interpreted
    # by the shell. Same curl invocation as before.
    cmd = ['curl', '-s', url, '--max-time', '10', '--retry', '3', '--retry-delay', '5']
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    data = p.communicate()[0]
    if data:
        data = re.sub(r"\n|\r", "", data)
    return data
# Fetch a URL with urllib and return the body with CR/LF removed.
def file_contents2(url):
    handle = urllib.urlopen(url)
    data = handle.read()
    handle.close()
    if data:
        data = re.sub(r'\n|\r', '', data)
    return data
# Download `url` to local path `file` via wget (3 tries, continue partial
# downloads, 15s timeout); return wget's exit status (0 on success).
def file_download(url, file):
    # SECURITY FIX: argument vector instead of a shell string — the old
    # version was injectable through url/file, and the unquoted %s for the
    # output path also broke on paths containing spaces.
    cmd = ['wget', '-t', '3', '-c', '-T', '15', url, '-O', file]
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    p.wait()
    s = p.poll()
    return s
# Extract the first image URL from an HTML fragment.
# NOTE: this is a duplicate definition that shadows the earlier img_filter.
# Takes the text between the src/SRC attribute name and the next space in
# the first <img ...> tag, then strips '=' and surrounding quote characters.
# Returns None when no image tag matches.
def img_filter(info):
    if not info:
        return None
    matches = re.findall(r'<(img|IMG)(.*?)(src|SRC)(.*?) (.*?)>', info)
    if not (matches and isinstance(matches, list)):
        return None
    raw = matches[0][3]
    for junk in ('=', "'", '"'):
        raw = raw.replace(junk, '')
    return raw
# Strip special characters: decode a gb18030 byte string, replace HTML
# entities / numeric character references with their characters (with
# Chinese-typography overrides for quote-like entities), collapse repeated
# whitespace, and return the result re-encoded as UTF-8.
# Returns "" when the input cannot be decoded; falsy input returns None.
def strip_code(s):
    if s:
        # Normalise a space-like character to a plain ASCII space first.
        # NOTE(review): the pattern and replacement render identically here —
        # the original byte was presumably a non-breaking space; confirm.
        s = re.sub(r' ',' ',s)
        try:
            s = s.decode("gb18030")
        except Exception,e:
            # Undecodable input is discarded entirely.
            return ""
        html_pattern = []
        html_str=htmlentitydefs.name2codepoint
        # Quote characters mapped to CJK-style quotation marks.
        # NOTE(review): patterns 1 and 3 look identical (r"'") — likely two
        # different smart-quote characters mangled in transit; confirm.
        html_pattern.append({re.compile(r"'"):u"‘"})
        html_pattern.append({re.compile(r'"'):u'”'})
        html_pattern.append({re.compile(r"'"):u'’'})
        # Numeric character references: &#NNN; (decimal) or &#xHH; (hex).
        html_pattern.append({re.compile('&#(x)?([0-9a-fA-F]+);'):lambda result:unichr(int(result.group(2),result.group(1)=='x' and 16 or 10))})
        # Named entities; the dp table overrides a handful with CJK glyphs.
        dp={"amp":u"&","quot":u"”","ldquo":u"“","rdquo":u"”","lt":u"<","gt":u">","lsquo":u"‘","rsquo":u"’"}
        for k,v in html_str.items():
            # Python 2 idiom: use the override if present, else the codepoint.
            vu = dp.has_key(k) and dp[k] or unichr(v)
            html_pattern.append({re.compile(r"&%s;" %(k)):vu})
        # Collapse runs of tabs / CRs / LFs / spaces to a single one,
        # and trim a leading or trailing newline.
        html_pattern.append({re.compile(r'(\t){2,}'):r'\t'})
        html_pattern.append({re.compile(r'(\r){2,}'):r'\r'})
        html_pattern.append({re.compile(r'(\n){2,}'):r'\n'})
        html_pattern.append({re.compile(r'( ){2,}'):u' '})
        html_pattern.append({re.compile(r'^(\n)|(\n)$'):''})
        # Apply every pattern in insertion order.
        for i in html_pattern:
            for k,v in i.items():
                s = k.sub(v,s)
        return s.encode("utf8")
# Convert an HTML fragment to plain text: drop <script>/<style> blocks,
# collapse literal whitespace, map <br> and </p> to newlines, remove all
# remaining tags plus the character '图', then normalise entities and
# encoding via strip_code().
def strip_tags(strs):
    if strs:
        rules = [
            (r'<script(.*?)</script>', ''),
            (r'<SCRIPT(.*?)</SCRIPT>', ''),
            (r'<style(.*?)<\/style>', ''),
            (r'\n|\r|\t', r'\n'),
            (r'<(br|BR)( ){0,}>', r'\n'),
            (r'<\/(p|P)>', r'\n'),
            (r'<.*?>', ''),
            (r'图', ''),
        ]
        for pattern, repl in rules:
            strs = re.sub(pattern, repl, strs)
        strs = strip_code(strs)
    return strs
# Pre-create a two-level grid of ten thousand directories ("00/00" through
# "99/99") under `path`, so files can later be sharded by two-digit
# prefixes. Existing directories are left untouched; falsy path is a no-op.
def mdir_yiwan(path):
    if not path:
        return
    for outer in range(100):
        first = "%s%02d/" % (path, outer)
        if not os.path.exists(first):
            os.makedirs(first)
        for inner in range(100):
            second = "%s%02d/" % (first, inner)
            if not os.path.exists(second):
                os.makedirs(second)
# Return a unique filename built from `path` plus a 6-character suffix of
# distinct alphanumeric characters; retries until the name does not exist.
def new_file_name(path):
    # PORTABILITY FIX: string.letters is Python-2-only and locale-dependent;
    # string.ascii_letters is the locale-independent equivalent and exists
    # on both Python 2 and 3.
    alphabet = "%s%s" % (string.digits, string.ascii_letters)
    while True:
        # random.sample picks 6 *distinct* characters, as before.
        suffix = "".join(random.sample(alphabet, 6))
        fname = "%s%s" % (path, suffix)
        if not os.path.exists(fname):
            return fname
# Write `info` (typically a dict) to `path` as a pickle, overwriting any
# existing file.
def write_log(path, info):
    # FIX: open in binary mode — a pickle is a byte stream; text mode
    # corrupts protocol >= 1 pickles on Windows and fails on Python 3.
    # try/finally guarantees the handle is closed even if dump() raises
    # (the old code also shadowed the builtin name `file`).
    log_file = open(path, "wb")
    try:
        pickle.dump(info, log_file)
    finally:
        log_file.close()
# Insert one news item (title, date, intro, optional image, type id, url)
# into the music_news_* tables, deduplicating by title prefix and linking
# the item to a matching album or artist when one is found in the title.
# WARNING(review): every SQL statement here is built with %-interpolation
# of scraped text — SQL injection risk; should use parameterised queries.
def in_data(title,dates,intro,img=None,typeid=0,url=None):
    title = title_filter(title)
    intro = strip_tags(intro)
    if title and dates and intro:
        # Clamp typeid to 0..3 (Python 2 "ternary" idiom).
        typeid = typeid > 3 and 3 or typeid
        type_arr = ['qt','rh','om','hy']
        newid = 0
        region = type_arr[typeid]
        print url
        print title
        # Dedup: any existing row whose title starts with this title.
        sql = '''select * from music_news_info where title like "%s%%" limit 1''' %(title)
        mdb = music_db()
        row = mdb.fetchone(sql)
        if not row:
            # Download + resize the remote image; returns local relative path.
            img = img_resize(img)
            today = datetime.datetime.now().strftime("%Y-%m-%d")
            # Append title/url to a per-day crawl log.
            logs = "%s\t%s\n" %(title,url)
            logfile = open('/www/scripts/music/zsc/news/log/%s' %(today),'a')
            logfile.write(logs)
            logfile.close()
            if img:
                # Tag the title with a "(photo)" marker, re-encoded to utf-8.
                title += "(图)".decode("gb18030").encode("utf8")
            print title
            objid = 0
            objtype = "none"
            sql = "select * from music_album_info order by albumname desc "# match against album names
            res = mdb.fetchall(sql)
            if len(res)>0:
                for v in res:
                    try:
                        # Link to the first album whose name occurs in the
                        # title. NOTE(review): album name is used as a raw
                        # regex — special characters raise, hence the bare
                        # except below.
                        if len(re.findall(r"%s" %(v['albumname']),title))>0:
                            objid = v['albumid']
                            objtype = "album"
                            break
                    except Exception,e:
                        pass
            if objid == 0:
                sql = "select * from music_artist_info order by artistname desc"# match against artist names
                res = mdb.fetchall(sql)
                if len(res)>0:
                    for v in res:
                        try:
                            # 1022953 is excluded — presumably a placeholder
                            # artist row; TODO confirm.
                            if v['artistid'] != 1022953 and len(re.findall(r"%s" %(v['artistname']),title))>0:
                                objid = v['artistid']
                                objtype = "artist"
                                break
                        except Exception,e:
                            pass
            sql = '''insert into music_news_info set title="%s", region="%s",objtype="%s",objid="%s",updatetime="%s"''' %(title,region,objtype,objid,dates)
            # query(sql, 1) returns the new auto-increment id.
            newid = mdb.query(sql,1)
            print newid
            if newid>0:
                newid = str(newid)
                # Shard the content table by the id's last digit.
                index = newid[-1]
                newtable = "music_news_content_%s" %(index)
                sql = '''create table if not exists %s like %s; ''' %(newtable,"music_news_content")
                mdb.query(sql)
                sql = '''insert into %s set newsid="%s",content="%s" ''' %(newtable,newid,intro)
                mdb.query(sql)
                if img:
                    sql = '''insert into music_news_images set newsid="%s",path="%s",serverpos="%s" ''' %(newid,img,"192.168.0.29")
                    mdb.query(sql)
# Thumbnail helper.
# infile: remote image URL. Downloads it into a per-day directory under
# dtpath with a timestamp+random name, resizes it in place so the longer
# side is at most 240px (aspect ratio preserved), and returns the path
# relative to dtpath. Returns None on any download/write failure.
def img_resize(infile):
    if infile:
        try:
            data = urllib.urlopen(infile).read()
        except Exception,e:
            return
        # File extension = everything after the last dot in the URL.
        ext = re.sub(r".*\.",r"",infile)
        # Timestamp split into integer part and first 4 fractional digits.
        t = repr(time.time())
        t = re.split(r'\.',t)
        # 4 distinct random digits.
        rand = "".join(random.sample(string.digits,4))
        today = datetime.datetime.now().strftime("%Y-%m-%d")
        dtpath = "/www/scripts/music/zsc/news/img/"
        # Per-day storage directory; created on first use.
        ndir = "%s" %(today)
        if os.path.isdir(dtpath+ndir) is False:
            os.mkdir(dtpath+ndir)
        # Relative file name: <day>/<secs>_<frac4>_<rand>.<ext>
        nfile2 = "%s/%s_%s_%s.%s" %(ndir,t[0],t[1][:4],rand,ext)
        nfile = dtpath+nfile2
        try:
            # Write the downloaded bytes (Python 2 file() builtin).
            f = file(nfile,"wb")
            f.write(data)
            f.close()
        except Exception,e:
            return
        inf = "%s" %(nfile)
        # Open the saved original with PIL.
        img = Image.open(inf)
        if img:
            size = img.size
            # Maximum length of the longer side, in pixels.
            m = 240
            w = size[0]
            h = size[1]
            # Scale so the longer dimension becomes m (Python 2 integer
            # division truncates the other dimension).
            if w>h and w>m:
                h = h*m/w
                w = m
            elif h>w and h>m:
                w = w*m/h
                h = m
            elif h==w and w>m:
                w = m
                h = m
            # Resize and overwrite the original file.
            nimg = img.resize((w,h))
            nimg.save(nfile)
            return nfile2