python84

anzhi84.py

#!/usr/bin/env python
#-*- coding: utf-8  -*-
import requests
import os,sys 
import time
import MySQLdb
import re
# Counter used by main() to name the icon files it writes under "anzhi/";
# it is bumped once per saved icon.
num = 0
# App detail-page links gathered by main() from the listing pages.
dataresult = list()
# Seconds to pause after a failed HTTP request before moving on.
_BACKOFF_SECONDS = 30
# Upper bound (seconds) on one HTTP request so a stalled server cannot hang
# the crawl forever.
_REQUEST_TIMEOUT = 30
_SITE_ROOT = 'http://www.anzhi.com/'
_ICON_DIR = "anzhi"

# Link to an app detail page on a listing page.
_LINK_RE = re.compile('<span class="app_name"><a href="(.+?)">')
# (label, pattern) pairs, in the column order used by _INSERT_SQL.
_FIELD_PATTERNS = (
    ("name", re.compile(r'<div class="detail_line">[\s\S]*?<h3>(.+?)</h3>')),
    ("version", re.compile('<span class="app_detail_version">(.+?)</span>')),
    ("developer", re.compile('开发者:(.+?)</span>')),
    ("pubtime", re.compile('发布时间:(.+?)</li>')),
    ("filesize", re.compile('文件大小:(.+?)</span></li>')),
    ("support", re.compile('系统支持:(.+?)</li>')),
    ("classify", re.compile('所属类别:(.+?)</li>')),
)
# Introduction block(s) of an app detail page.
_INTRO_RE = re.compile(r'<div class="app_detail_infor">([\s\S]*?)</div>')
# Icon URL. The previous pattern ended in a lazy "(.+?)", which can only ever
# capture a single character; capture the quoted attribute value instead.
# NOTE(review): assumes the src attribute is double-quoted -- confirm on a live page.
_ICON_RE = re.compile(r'<div class="detail_icon">[\s\S]*?<img src="(.+?)"')
# Eight columns for eight placeholders. The previous statement listed
# "classifyintroduction" (missing comma), so every insert was rejected.
# NOTE(review): assumes the table has separate `classify` and `introduction`
# columns -- confirm against the schema.
_INSERT_SQL = ("insert into anzhi(name,version,developer,pubtime,filesize,"
               "support,classify,introduction) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s)")


def _fetch(url):
    """Request *url*; return the response, or None after a failed request.

    On failure the error is printed and the crawl pauses for
    _BACKOFF_SECONDS before the caller moves on (no retry is attempted).
    """
    try:
        return requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print("request failed for %s: %s" % (url, err))
        time.sleep(_BACKOFF_SECONDS)
        return None


def _page_text(response):
    """Return the response body as the interpreter's native ``str`` type.

    On Python 2 the raw byte string is returned unchanged, so it matches the
    byte-string patterns above. On Python 3 the bytes are decoded as UTF-8 so
    they can be matched against text patterns.
    """
    body = response.content
    if not isinstance(body, str):
        body = body.decode("utf-8", "replace")
    return body


def _collect_links(first_page, last_page):
    """Append every not-yet-seen app link from the listing pages to dataresult."""
    seen = set(dataresult)
    for k in range(first_page, last_page + 1):
        url = "http://www.anzhi.com/list_1_" + str(k) + "_hot.html"
        print(url)
        response = _fetch(url)
        if response is None:
            continue
        for href in _LINK_RE.findall(_page_text(response)):
            if href not in seen:
                seen.add(href)
                dataresult.append(href)
        print(len(dataresult))


def _extract_fields(page):
    """Return the seven scalar fields in insert order, or None if one is missing."""
    fields = []
    for label, pattern in _FIELD_PATTERNS:
        match = pattern.search(page)
        if match is None:
            print("missing field: " + label)
            return None
        fields.append(match.group(1))
    return fields


def _save_icon(page, index):
    """Download the page's icon to "anzhi/<index>"; return True if it was written."""
    match = _ICON_RE.search(page)
    if match is None:
        return False
    icon_url = match.group(1)
    print(icon_url)
    response = _fetch(icon_url)
    if response is None:
        return False
    if not os.path.isdir(_ICON_DIR):
        os.makedirs(_ICON_DIR)
    # Binary mode: the payload is image bytes, not text.
    with open(os.path.join(_ICON_DIR, str(index)), "wb") as icon_file:
        icon_file.write(response.content)
    return True


def main(first_page=1, last_page=7772):
    """Crawl anzhi.com app listings and store app details in MySQL.

    Steps:
      1. Connect to the local MySQL database; on failure print the error and
         exit with status 1.
      2. Walk listing pages ``first_page..last_page`` (inclusive) and collect
         the app detail links into the module-level ``dataresult`` list,
         without duplicates.
      3. Append the number of collected links to "anzhi.txt".
      4. For every link: fetch the detail page, print the extracted fields,
         insert one row per introduction block into table ``anzhi``, and save
         the app icon as "anzhi/<num>", where ``num`` is the module-level
         counter of icons saved so far.

    A page that cannot be fetched, or that lacks one of the expected fields,
    is skipped instead of aborting the whole crawl.

    Args:
        first_page: first listing page number to fetch (default 1).
        last_page: last listing page number to fetch (default 7772).
    """
    global num
    try:
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                               db='addressbookdb', charset="utf8")
        conn.query("set names utf8")
    except Exception as e:
        print(e)
        sys.exit(1)
    cursor = conn.cursor()
    try:
        _collect_links(first_page, last_page)

        with open("anzhi.txt", "a+") as count_file:
            count_file.write(str(len(dataresult)))
        print(len(dataresult))

        for link in dataresult:
            print(link)
            response = _fetch(_SITE_ROOT + link)
            if response is None:
                # Never parse a stale page left over from the previous app.
                continue
            page = _page_text(response)

            fields = _extract_fields(page)
            if fields is None:
                continue
            for value in fields:
                print(value)

            intros = _INTRO_RE.findall(page)
            for intro in intros:
                print(intro.replace('<br />', ' '))
            for intro in intros:
                row = tuple(fields) + (intro.replace('</p> <br />', ' '),)
                try:
                    cursor.execute(_INSERT_SQL, row)
                    conn.commit()
                except MySQLdb.Error as err:
                    # Keep crawling, but make the failure visible.
                    print(err)

            if _save_icon(page, num):
                num = num + 1
                print(num)
    finally:
        cursor.close()
        conn.close()
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
       


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值