#!/usr/bin/python
#coding:gbk
import re,urllib
import sys
from subprocess import *
import os
import time
import cPickle as pickle
import string
import random
import datetime
import urllib
from glob import glob
from music_db import *
import PIL.Image as Image
import htmlentitydefs
# Normalize a Chinese-style date string: drop all whitespace, turn the
# 年 (year) / 月 (month) markers into dashes and the 日 (day) marker
# into a trailing space, e.g. "2020年5月3日" -> "2020-5-3 ".
def format_date(dt):
    for pattern, repl in ((r'\s', ''), (r'年|月', '-'), (r'日', ' ')):
        dt = re.sub(pattern, repl, dt)
    return dt
# Clean a news headline: strip "(图)" / "(组图)" photo markers, the bare
# 图/组图 characters and colons, then run it through strip_code() for
# entity/encoding normalisation. Falsy input is returned as-is.
def title_filter(title):
    if title:
        # BUG FIX: the old code first ran re.sub("(图)|(组图)", "", title);
        # those parentheses are regex groups, so it stripped the bare
        # characters and left dangling "()" in the title that the literal
        # patterns below could never match. Remove the literal markers first.
        title = re.sub("\(图\)|\(组图\)","",title)
        title = re.sub("图|组图","",title)
        title = re.sub(":","",title)
        title = strip_code(title)
    return title
# Extract the first image URL from an HTML fragment.
# Takes the text between the src/SRC attribute name and the next space in
# the first <img ...> tag, then strips '=' and surrounding quote characters.
# Returns None when no image tag matches.
def img_filter(info):
    if not info:
        return None
    matches = re.findall(r'<(img|IMG)(.*?)(src|SRC)(.*?) (.*?)>', info)
    if not (matches and isinstance(matches, list)):
        return None
    raw = matches[0][3]
    for junk in ('=', "'", '"'):
        raw = raw.replace(junk, '')
    return raw
# Fetch a URL with curl (10s timeout, 3 retries, 5s between retries) and
# return the body with CR/LF stripped; empty output on failure.
def file_contents(url):
    # SECURITY FIX: build an argument vector instead of a shell string so
    # metacharacters in the (untrusted, crawled) URL cannot be interpreted
    # by the shell. Same curl invocation as before.
    cmd = ['curl', '-s', url, '--max-time', '10', '--retry', '3', '--retry-delay', '5']
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    data = p.communicate()[0]
    if data:
        data = re.sub(r"\n|\r", "", data)
    return data
# Fetch a URL with urllib and return the body with CR/LF removed.
def file_contents2(url):
    handle = urllib.urlopen(url)
    data = handle.read()
    handle.close()
    if data:
        data = re.sub(r'\n|\r', '', data)
    return data
# Download `url` to local path `file` via wget (3 tries, continue partial
# downloads, 15s timeout); return wget's exit status (0 on success).
def file_download(url, file):
    # SECURITY FIX: argument vector instead of a shell string — the old
    # version was injectable through url/file, and the unquoted %s for the
    # output path also broke on paths containing spaces.
    cmd = ['wget', '-t', '3', '-c', '-T', '15', url, '-O', file]
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    p.wait()
    s = p.poll()
    return s
# Extract the first image URL from an HTML fragment.
# NOTE: this is a duplicate definition that shadows the earlier img_filter.
# Takes the text between the src/SRC attribute name and the next space in
# the first <img ...> tag, then strips '=' and surrounding quote characters.
# Returns None when no image tag matches.
def img_filter(info):
    if not info:
        return None
    matches = re.findall(r'<(img|IMG)(.*?)(src|SRC)(.*?) (.*?)>', info)
    if not (matches and isinstance(matches, list)):
        return None
    raw = matches[0][3]
    for junk in ('=', "'", '"'):
        raw = raw.replace(junk, '')
    return raw
# Strip special characters: decode a gb18030 byte string, replace HTML
# entities / numeric character references with their characters (with
# Chinese-typography overrides for quote-like entities), collapse repeated
# whitespace, and return the result re-encoded as UTF-8.
# Returns "" when the input cannot be decoded; falsy input returns None.
def strip_code(s):
    if s:
        # Normalise a space-like character to a plain ASCII space first.
        # NOTE(review): the pattern and replacement render identically here —
        # the original byte was presumably a non-breaking space; confirm.
        s = re.sub(r' ',' ',s)
        try:
            s = s.decode("gb18030")
        except Exception,e:
            # Undecodable input is discarded entirely.
            return ""
        html_pattern = []
        html_str=htmlentitydefs.name2codepoint
        # Quote characters mapped to CJK-style quotation marks.
        # NOTE(review): patterns 1 and 3 look identical (r"'") — likely two
        # different smart-quote characters mangled in transit; confirm.
        html_pattern.append({re.compile(r"'"):u"‘"})
        html_pattern.append({re.compile(r'"'):u'”'})
        html_pattern.append({re.compile(r"'"):u'’'})
        # Numeric character references: &#NNN; (decimal) or &#xHH; (hex).
        html_pattern.append({re.compile('&#(x)?([0-9a-fA-F]+);'):lambda result:unichr(int(result.group(2),result.group(1)=='x' and 16 or 10))})
        # Named entities; the dp table overrides a handful with CJK glyphs.
        dp={"amp":u"&","quot":u"”","ldquo":u"“","rdquo":u"”","lt":u"<","gt":u">","lsquo":u"‘","rsquo":u"’"}
        for k,v in html_str.items():
            # Python 2 idiom: use the override if present, else the codepoint.
            vu = dp.has_key(k) and dp[k] or unichr(v)
            html_pattern.append({re.compile(r"&%s;" %(k)):vu})
        # Collapse runs of tabs / CRs / LFs / spaces to a single one,
        # and trim a leading or trailing newline.
        html_pattern.append({re.compile(r'(\t){2,}'):r'\t'})
        html_pattern.append({re.compile(r'(\r){2,}'):r'\r'})
        html_pattern.append({re.compile(r'(\n){2,}'):r'\n'})
        html_pattern.append({re.compile(r'( ){2,}'):u' '})
        html_pattern.append({re.compile(r'^(\n)|(\n)$'):''})
        # Apply every pattern in insertion order.
        for i in html_pattern:
            for k,v in i.items():
                s = k.sub(v,s)
        return s.encode("utf8")
# Convert an HTML fragment to plain text: drop <script>/<style> blocks,
# collapse literal whitespace, map <br> and </p> to newlines, remove all
# remaining tags plus the character '图', then normalise entities and
# encoding via strip_code().
def strip_tags(strs):
    if strs:
        rules = [
            (r'<script(.*?)</script>', ''),
            (r'<SCRIPT(.*?)</SCRIPT>', ''),
            (r'<style(.*?)<\/style>', ''),
            (r'\n|\r|\t', r'\n'),
            (r'<(br|BR)( ){0,}>', r'\n'),
            (r'<\/(p|P)>', r'\n'),
            (r'<.*?>', ''),
            (r'图', ''),
        ]
        for pattern, repl in rules:
            strs = re.sub(pattern, repl, strs)
        strs = strip_code(strs)
    return strs
# Pre-create a two-level grid of ten thousand directories ("00/00" through
# "99/99") under `path`, so files can later be sharded by two-digit
# prefixes. Existing directories are left untouched; falsy path is a no-op.
def mdir_yiwan(path):
    if not path:
        return
    for outer in range(100):
        first = "%s%02d/" % (path, outer)
        if not os.path.exists(first):
            os.makedirs(first)
        for inner in range(100):
            second = "%s%02d/" % (first, inner)
            if not os.path.exists(second):
                os.makedirs(second)
# Return a unique filename built from `path` plus a 6-character suffix of
# distinct alphanumeric characters; retries until the name does not exist.
def new_file_name(path):
    # PORTABILITY FIX: string.letters is Python-2-only and locale-dependent;
    # string.ascii_letters is the locale-independent equivalent and exists
    # on both Python 2 and 3.
    alphabet = "%s%s" % (string.digits, string.ascii_letters)
    while True:
        # random.sample picks 6 *distinct* characters, as before.
        suffix = "".join(random.sample(alphabet, 6))
        fname = "%s%s" % (path, suffix)
        if not os.path.exists(fname):
            return fname
# Write `info` (typically a dict) to `path` as a pickle, overwriting any
# existing file.
def write_log(path, info):
    # FIX: open in binary mode — a pickle is a byte stream; text mode
    # corrupts protocol >= 1 pickles on Windows and fails on Python 3.
    # try/finally guarantees the handle is closed even if dump() raises
    # (the old code also shadowed the builtin name `file`).
    log_file = open(path, "wb")
    try:
        pickle.dump(info, log_file)
    finally:
        log_file.close()
# Insert one news item (title, date, intro, optional image, type id, url)
# into the music_news_* tables, deduplicating by title prefix and linking
# the item to a matching album or artist when one is found in the title.
# WARNING(review): every SQL statement here is built with %-interpolation
# of scraped text — SQL injection risk; should use parameterised queries.
def in_data(title,dates,intro,img=None,typeid=0,url=None):
    title = title_filter(title)
    intro = strip_tags(intro)
    if title and dates and intro:
        # Clamp typeid to 0..3 (Python 2 "ternary" idiom).
        typeid = typeid > 3 and 3 or typeid
        type_arr = ['qt','rh','om','hy']
        newid = 0
        region = type_arr[typeid]
        print url
        print title
        # Dedup: any existing row whose title starts with this title.
        sql = '''select * from music_news_info where title like "%s%%" limit 1''' %(title)
        mdb = music_db()
        row = mdb.fetchone(sql)
        if not row:
            # Download + resize the remote image; returns local relative path.
            img = img_resize(img)
            today = datetime.datetime.now().strftime("%Y-%m-%d")
            # Append title/url to a per-day crawl log.
            logs = "%s\t%s\n" %(title,url)
            logfile = open('/www/scripts/music/zsc/news/log/%s' %(today),'a')
            logfile.write(logs)
            logfile.close()
            if img:
                # Tag the title with a "(photo)" marker, re-encoded to utf-8.
                title += "(图)".decode("gb18030").encode("utf8")
            print title
            objid = 0
            objtype = "none"
            sql = "select * from music_album_info order by albumname desc "# match against album names
            res = mdb.fetchall(sql)
            if len(res)>0:
                for v in res:
                    try:
                        # Link to the first album whose name occurs in the
                        # title. NOTE(review): album name is used as a raw
                        # regex — special characters raise, hence the bare
                        # except below.
                        if len(re.findall(r"%s" %(v['albumname']),title))>0:
                            objid = v['albumid']
                            objtype = "album"
                            break
                    except Exception,e:
                        pass
            if objid == 0:
                sql = "select * from music_artist_info order by artistname desc"# match against artist names
                res = mdb.fetchall(sql)
                if len(res)>0:
                    for v in res:
                        try:
                            # 1022953 is excluded — presumably a placeholder
                            # artist row; TODO confirm.
                            if v['artistid'] != 1022953 and len(re.findall(r"%s" %(v['artistname']),title))>0:
                                objid = v['artistid']
                                objtype = "artist"
                                break
                        except Exception,e:
                            pass
            sql = '''insert into music_news_info set title="%s", region="%s",objtype="%s",objid="%s",updatetime="%s"''' %(title,region,objtype,objid,dates)
            # query(sql, 1) returns the new auto-increment id.
            newid = mdb.query(sql,1)
            print newid
            if newid>0:
                newid = str(newid)
                # Shard the content table by the id's last digit.
                index = newid[-1]
                newtable = "music_news_content_%s" %(index)
                sql = '''create table if not exists %s like %s; ''' %(newtable,"music_news_content")
                mdb.query(sql)
                sql = '''insert into %s set newsid="%s",content="%s" ''' %(newtable,newid,intro)
                mdb.query(sql)
                if img:
                    sql = '''insert into music_news_images set newsid="%s",path="%s",serverpos="%s" ''' %(newid,img,"192.168.0.29")
                    mdb.query(sql)
# Thumbnail helper.
# infile: remote image URL. Downloads it into a per-day directory under
# dtpath with a timestamp+random name, resizes it in place so the longer
# side is at most 240px (aspect ratio preserved), and returns the path
# relative to dtpath. Returns None on any download/write failure.
def img_resize(infile):
    if infile:
        try:
            data = urllib.urlopen(infile).read()
        except Exception,e:
            return
        # File extension = everything after the last dot in the URL.
        ext = re.sub(r".*\.",r"",infile)
        # Timestamp split into integer part and first 4 fractional digits.
        t = repr(time.time())
        t = re.split(r'\.',t)
        # 4 distinct random digits.
        rand = "".join(random.sample(string.digits,4))
        today = datetime.datetime.now().strftime("%Y-%m-%d")
        dtpath = "/www/scripts/music/zsc/news/img/"
        # Per-day storage directory; created on first use.
        ndir = "%s" %(today)
        if os.path.isdir(dtpath+ndir) is False:
            os.mkdir(dtpath+ndir)
        # Relative file name: <day>/<secs>_<frac4>_<rand>.<ext>
        nfile2 = "%s/%s_%s_%s.%s" %(ndir,t[0],t[1][:4],rand,ext)
        nfile = dtpath+nfile2
        try:
            # Write the downloaded bytes (Python 2 file() builtin).
            f = file(nfile,"wb")
            f.write(data)
            f.close()
        except Exception,e:
            return
        inf = "%s" %(nfile)
        # Open the saved original with PIL.
        img = Image.open(inf)
        if img:
            size = img.size
            # Maximum length of the longer side, in pixels.
            m = 240
            w = size[0]
            h = size[1]
            # Scale so the longer dimension becomes m (Python 2 integer
            # division truncates the other dimension).
            if w>h and w>m:
                h = h*m/w
                w = m
            elif h>w and h>m:
                w = w*m/h
                h = m
            elif h==w and w>m:
                w = m
                h = m
            # Resize and overwrite the original file.
            nimg = img.resize((w,h))
            nimg.save(nfile)
            return nfile2
#coding:gbk
import re,urllib
import sys
from subprocess import *
import os
import time
import cPickle as pickle
import string
import random
import datetime
import urllib
from glob import glob
from music_db import *
import PIL.Image as Image
import htmlentitydefs
# Normalize a Chinese-style date string: drop all whitespace, turn the
# 年 (year) / 月 (month) markers into dashes and the 日 (day) marker
# into a trailing space, e.g. "2020年5月3日" -> "2020-5-3 ".
def format_date(dt):
    for pattern, repl in ((r'\s', ''), (r'年|月', '-'), (r'日', ' ')):
        dt = re.sub(pattern, repl, dt)
    return dt
# Clean a news headline: strip "(图)" / "(组图)" photo markers, the bare
# 图/组图 characters and colons, then run it through strip_code() for
# entity/encoding normalisation. Falsy input is returned as-is.
def title_filter(title):
    if title:
        # BUG FIX: the old code first ran re.sub("(图)|(组图)", "", title);
        # those parentheses are regex groups, so it stripped the bare
        # characters and left dangling "()" in the title that the literal
        # patterns below could never match. Remove the literal markers first.
        title = re.sub("\(图\)|\(组图\)","",title)
        title = re.sub("图|组图","",title)
        title = re.sub(":","",title)
        title = strip_code(title)
    return title
# Extract the first image URL from an HTML fragment.
# Takes the text between the src/SRC attribute name and the next space in
# the first <img ...> tag, then strips '=' and surrounding quote characters.
# Returns None when no image tag matches.
def img_filter(info):
    if not info:
        return None
    matches = re.findall(r'<(img|IMG)(.*?)(src|SRC)(.*?) (.*?)>', info)
    if not (matches and isinstance(matches, list)):
        return None
    raw = matches[0][3]
    for junk in ('=', "'", '"'):
        raw = raw.replace(junk, '')
    return raw
# Fetch a URL with curl (10s timeout, 3 retries, 5s between retries) and
# return the body with CR/LF stripped; empty output on failure.
def file_contents(url):
    # SECURITY FIX: build an argument vector instead of a shell string so
    # metacharacters in the (untrusted, crawled) URL cannot be interpreted
    # by the shell. Same curl invocation as before.
    cmd = ['curl', '-s', url, '--max-time', '10', '--retry', '3', '--retry-delay', '5']
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    data = p.communicate()[0]
    if data:
        data = re.sub(r"\n|\r", "", data)
    return data
# Fetch a URL with urllib and return the body with CR/LF removed.
def file_contents2(url):
    handle = urllib.urlopen(url)
    data = handle.read()
    handle.close()
    if data:
        data = re.sub(r'\n|\r', '', data)
    return data
# Download `url` to local path `file` via wget (3 tries, continue partial
# downloads, 15s timeout); return wget's exit status (0 on success).
def file_download(url, file):
    # SECURITY FIX: argument vector instead of a shell string — the old
    # version was injectable through url/file, and the unquoted %s for the
    # output path also broke on paths containing spaces.
    cmd = ['wget', '-t', '3', '-c', '-T', '15', url, '-O', file]
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    p.wait()
    s = p.poll()
    return s
# Extract the first image URL from an HTML fragment.
# NOTE: this is a duplicate definition that shadows the earlier img_filter.
# Takes the text between the src/SRC attribute name and the next space in
# the first <img ...> tag, then strips '=' and surrounding quote characters.
# Returns None when no image tag matches.
def img_filter(info):
    if not info:
        return None
    matches = re.findall(r'<(img|IMG)(.*?)(src|SRC)(.*?) (.*?)>', info)
    if not (matches and isinstance(matches, list)):
        return None
    raw = matches[0][3]
    for junk in ('=', "'", '"'):
        raw = raw.replace(junk, '')
    return raw
# Strip special characters: decode a gb18030 byte string, replace HTML
# entities / numeric character references with their characters (with
# Chinese-typography overrides for quote-like entities), collapse repeated
# whitespace, and return the result re-encoded as UTF-8.
# Returns "" when the input cannot be decoded; falsy input returns None.
def strip_code(s):
    if s:
        # Normalise a space-like character to a plain ASCII space first.
        # NOTE(review): the pattern and replacement render identically here —
        # the original byte was presumably a non-breaking space; confirm.
        s = re.sub(r' ',' ',s)
        try:
            s = s.decode("gb18030")
        except Exception,e:
            # Undecodable input is discarded entirely.
            return ""
        html_pattern = []
        html_str=htmlentitydefs.name2codepoint
        # Quote characters mapped to CJK-style quotation marks.
        # NOTE(review): patterns 1 and 3 look identical (r"'") — likely two
        # different smart-quote characters mangled in transit; confirm.
        html_pattern.append({re.compile(r"'"):u"‘"})
        html_pattern.append({re.compile(r'"'):u'”'})
        html_pattern.append({re.compile(r"'"):u'’'})
        # Numeric character references: &#NNN; (decimal) or &#xHH; (hex).
        html_pattern.append({re.compile('&#(x)?([0-9a-fA-F]+);'):lambda result:unichr(int(result.group(2),result.group(1)=='x' and 16 or 10))})
        # Named entities; the dp table overrides a handful with CJK glyphs.
        dp={"amp":u"&","quot":u"”","ldquo":u"“","rdquo":u"”","lt":u"<","gt":u">","lsquo":u"‘","rsquo":u"’"}
        for k,v in html_str.items():
            # Python 2 idiom: use the override if present, else the codepoint.
            vu = dp.has_key(k) and dp[k] or unichr(v)
            html_pattern.append({re.compile(r"&%s;" %(k)):vu})
        # Collapse runs of tabs / CRs / LFs / spaces to a single one,
        # and trim a leading or trailing newline.
        html_pattern.append({re.compile(r'(\t){2,}'):r'\t'})
        html_pattern.append({re.compile(r'(\r){2,}'):r'\r'})
        html_pattern.append({re.compile(r'(\n){2,}'):r'\n'})
        html_pattern.append({re.compile(r'( ){2,}'):u' '})
        html_pattern.append({re.compile(r'^(\n)|(\n)$'):''})
        # Apply every pattern in insertion order.
        for i in html_pattern:
            for k,v in i.items():
                s = k.sub(v,s)
        return s.encode("utf8")
# Convert an HTML fragment to plain text: drop <script>/<style> blocks,
# collapse literal whitespace, map <br> and </p> to newlines, remove all
# remaining tags plus the character '图', then normalise entities and
# encoding via strip_code().
def strip_tags(strs):
    if strs:
        rules = [
            (r'<script(.*?)</script>', ''),
            (r'<SCRIPT(.*?)</SCRIPT>', ''),
            (r'<style(.*?)<\/style>', ''),
            (r'\n|\r|\t', r'\n'),
            (r'<(br|BR)( ){0,}>', r'\n'),
            (r'<\/(p|P)>', r'\n'),
            (r'<.*?>', ''),
            (r'图', ''),
        ]
        for pattern, repl in rules:
            strs = re.sub(pattern, repl, strs)
        strs = strip_code(strs)
    return strs
# Pre-create a two-level grid of ten thousand directories ("00/00" through
# "99/99") under `path`, so files can later be sharded by two-digit
# prefixes. Existing directories are left untouched; falsy path is a no-op.
def mdir_yiwan(path):
    if not path:
        return
    for outer in range(100):
        first = "%s%02d/" % (path, outer)
        if not os.path.exists(first):
            os.makedirs(first)
        for inner in range(100):
            second = "%s%02d/" % (first, inner)
            if not os.path.exists(second):
                os.makedirs(second)
# Return a unique filename built from `path` plus a 6-character suffix of
# distinct alphanumeric characters; retries until the name does not exist.
def new_file_name(path):
    # PORTABILITY FIX: string.letters is Python-2-only and locale-dependent;
    # string.ascii_letters is the locale-independent equivalent and exists
    # on both Python 2 and 3.
    alphabet = "%s%s" % (string.digits, string.ascii_letters)
    while True:
        # random.sample picks 6 *distinct* characters, as before.
        suffix = "".join(random.sample(alphabet, 6))
        fname = "%s%s" % (path, suffix)
        if not os.path.exists(fname):
            return fname
# Write `info` (typically a dict) to `path` as a pickle, overwriting any
# existing file.
def write_log(path, info):
    # FIX: open in binary mode — a pickle is a byte stream; text mode
    # corrupts protocol >= 1 pickles on Windows and fails on Python 3.
    # try/finally guarantees the handle is closed even if dump() raises
    # (the old code also shadowed the builtin name `file`).
    log_file = open(path, "wb")
    try:
        pickle.dump(info, log_file)
    finally:
        log_file.close()
# Insert one news item (title, date, intro, optional image, type id, url)
# into the music_news_* tables, deduplicating by title prefix and linking
# the item to a matching album or artist when one is found in the title.
# WARNING(review): every SQL statement here is built with %-interpolation
# of scraped text — SQL injection risk; should use parameterised queries.
def in_data(title,dates,intro,img=None,typeid=0,url=None):
    title = title_filter(title)
    intro = strip_tags(intro)
    if title and dates and intro:
        # Clamp typeid to 0..3 (Python 2 "ternary" idiom).
        typeid = typeid > 3 and 3 or typeid
        type_arr = ['qt','rh','om','hy']
        newid = 0
        region = type_arr[typeid]
        print url
        print title
        # Dedup: any existing row whose title starts with this title.
        sql = '''select * from music_news_info where title like "%s%%" limit 1''' %(title)
        mdb = music_db()
        row = mdb.fetchone(sql)
        if not row:
            # Download + resize the remote image; returns local relative path.
            img = img_resize(img)
            today = datetime.datetime.now().strftime("%Y-%m-%d")
            # Append title/url to a per-day crawl log.
            logs = "%s\t%s\n" %(title,url)
            logfile = open('/www/scripts/music/zsc/news/log/%s' %(today),'a')
            logfile.write(logs)
            logfile.close()
            if img:
                # Tag the title with a "(photo)" marker, re-encoded to utf-8.
                title += "(图)".decode("gb18030").encode("utf8")
            print title
            objid = 0
            objtype = "none"
            sql = "select * from music_album_info order by albumname desc "# match against album names
            res = mdb.fetchall(sql)
            if len(res)>0:
                for v in res:
                    try:
                        # Link to the first album whose name occurs in the
                        # title. NOTE(review): album name is used as a raw
                        # regex — special characters raise, hence the bare
                        # except below.
                        if len(re.findall(r"%s" %(v['albumname']),title))>0:
                            objid = v['albumid']
                            objtype = "album"
                            break
                    except Exception,e:
                        pass
            if objid == 0:
                sql = "select * from music_artist_info order by artistname desc"# match against artist names
                res = mdb.fetchall(sql)
                if len(res)>0:
                    for v in res:
                        try:
                            # 1022953 is excluded — presumably a placeholder
                            # artist row; TODO confirm.
                            if v['artistid'] != 1022953 and len(re.findall(r"%s" %(v['artistname']),title))>0:
                                objid = v['artistid']
                                objtype = "artist"
                                break
                        except Exception,e:
                            pass
            sql = '''insert into music_news_info set title="%s", region="%s",objtype="%s",objid="%s",updatetime="%s"''' %(title,region,objtype,objid,dates)
            # query(sql, 1) returns the new auto-increment id.
            newid = mdb.query(sql,1)
            print newid
            if newid>0:
                newid = str(newid)
                # Shard the content table by the id's last digit.
                index = newid[-1]
                newtable = "music_news_content_%s" %(index)
                sql = '''create table if not exists %s like %s; ''' %(newtable,"music_news_content")
                mdb.query(sql)
                sql = '''insert into %s set newsid="%s",content="%s" ''' %(newtable,newid,intro)
                mdb.query(sql)
                if img:
                    sql = '''insert into music_news_images set newsid="%s",path="%s",serverpos="%s" ''' %(newid,img,"192.168.0.29")
                    mdb.query(sql)
# Thumbnail helper.
# infile: remote image URL. Downloads it into a per-day directory under
# dtpath with a timestamp+random name, resizes it in place so the longer
# side is at most 240px (aspect ratio preserved), and returns the path
# relative to dtpath. Returns None on any download/write failure.
def img_resize(infile):
    if infile:
        try:
            data = urllib.urlopen(infile).read()
        except Exception,e:
            return
        # File extension = everything after the last dot in the URL.
        ext = re.sub(r".*\.",r"",infile)
        # Timestamp split into integer part and first 4 fractional digits.
        t = repr(time.time())
        t = re.split(r'\.',t)
        # 4 distinct random digits.
        rand = "".join(random.sample(string.digits,4))
        today = datetime.datetime.now().strftime("%Y-%m-%d")
        dtpath = "/www/scripts/music/zsc/news/img/"
        # Per-day storage directory; created on first use.
        ndir = "%s" %(today)
        if os.path.isdir(dtpath+ndir) is False:
            os.mkdir(dtpath+ndir)
        # Relative file name: <day>/<secs>_<frac4>_<rand>.<ext>
        nfile2 = "%s/%s_%s_%s.%s" %(ndir,t[0],t[1][:4],rand,ext)
        nfile = dtpath+nfile2
        try:
            # Write the downloaded bytes (Python 2 file() builtin).
            f = file(nfile,"wb")
            f.write(data)
            f.close()
        except Exception,e:
            return
        inf = "%s" %(nfile)
        # Open the saved original with PIL.
        img = Image.open(inf)
        if img:
            size = img.size
            # Maximum length of the longer side, in pixels.
            m = 240
            w = size[0]
            h = size[1]
            # Scale so the longer dimension becomes m (Python 2 integer
            # division truncates the other dimension).
            if w>h and w>m:
                h = h*m/w
                w = m
            elif h>w and h>m:
                w = w*m/h
                h = m
            elif h==w and w>m:
                w = m
                h = m
            # Resize and overwrite the original file.
            nimg = img.resize((w,h))
            nimg.save(nfile)
            return nfile2