#!/usr/bin/env python
#-*- coding: utf-8 -*-
import requests
import os,sys
import MySQLdb
import time
from BeautifulSoup import BeautifulSoup
import re
# Running count of cover images saved so far; main() uses the current value
# as the file name of the next image it writes and then increments it.
num=0
# Module-level list; nothing in the code shown reads or modifies it through
# the global name.
dataresult=[]
# Root of every URL requested by the crawler.
_BASE_URL = "https://play.google.com"
# Directory that receives the downloaded cover images (one file per image,
# named after the running counter).
_IMAGE_DIR = "googlemarket"
# Seconds to wait after a failed HTTP request before moving on.
_RETRY_DELAY = 30
# Upper bound, in seconds, for a single HTTP request so that one stalled
# connection cannot hang the whole crawl.
_HTTP_TIMEOUT = 60
# Line of the prettified developer anchor text that is stored as the developer
# name.  NOTE(review): presumably tied to the page layout this script was
# written against -- verify against the current markup.
_DEVELOPER_LINE = 8

# Play Store category identifiers; each one is appended to the category URL
# and also stored in the `classify` column.
_CATEGORIES = (
    'PERSONALIZATION', 'TRANSPORTATION', 'SPORTS', 'HEALTH_AND_FITNESS',
    'APP_WALLPAPER', 'COMICS', 'MEDICAL', 'BUSINESS', 'BOOKS_AND_REFERENCE',
    'WEATHER', 'ENTERTAINMENT', 'MEDIA_AND_VIDEO', 'APP_WIDGETS', 'TOOLS',
    'PHOTOGRAPHY', 'PRODUCTIVITY', 'EDUCATION', 'NEWS_AND_MAGAZINES',
    'TRAVEL_AND_LOCAL', 'LIFESTYLE', 'SOCIAL', 'FINANCE', 'SHOPPING',
    'LIBRARIES_AND_DEMO', 'COMMUNICATION', 'MUSIC_AND_AUDIO', 'GAME',
)

_INSERT_SQL = ("insert into googlemarket(name,developer,version,pubtime,"
               "filesize,support,classify,introduction) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s)")

# Patterns are compiled once here instead of on every loop iteration.
_APP_LINK_RE = re.compile(r'<a class="title" href="(.+?)" title')
_NAME_RE = re.compile(
    r'<div class="document-title" itemprop="name">[\s\S]*?<div>([\s\S]*?)</div>')
_DEVELOPER_RE = re.compile(r'itemprop="name">([\s\S]*?)</a>')
_VERSION_RE = re.compile(r'itemprop="softwareVersion">([\s\S]*?)</div>')
_PUBLISHED_RE = re.compile(r'itemprop="datePublished">([\s\S]*?)</div>')
_FILE_SIZE_RE = re.compile(r'itemprop="fileSize">([\s\S]*?)</div>')
_OS_RE = re.compile(r'itemprop="operatingSystems">([\s\S]*?)</div>')
_DESCRIPTION_RE = re.compile(
    r'itemprop="description">[\s\S]*?<div>([\s\S]*?)</div>')
_COVER_RE = re.compile(
    r'<img class="cover-image" src=(.+?) alt="Cover art" itemprop="image" />')
# Matches <br>, <br/>, <br /> and <p>/</p> as whole tags.  The original used
# the character class '[<br /> <p> </p>]', which blanked every single
# 'b', 'r', 'p', '<', '>', '/' and space in the description text.
_BREAK_TAG_RE = re.compile(r'<br\s*/?>|</?p>')


def _fetch_html(url):
    """Return the prettified UTF-8 markup of *url*, or None if it failed.

    On any error the exception is printed and the function sleeps for
    ``_RETRY_DELAY`` seconds before returning None, so callers can simply
    skip the page instead of parsing stale or undefined data.
    """
    try:
        response = requests.get(url, timeout=_HTTP_TIMEOUT)
        return BeautifulSoup(response.content).prettify("utf-8")
    except Exception as exc:
        print(exc)
        time.sleep(_RETRY_DELAY)
        return None


def _first(pattern, text):
    """Return the first capture of *pattern* in *text*, or None if absent."""
    found = pattern.findall(text)
    return found[0] if found else None


def _store_app(conn, cursor, category_name, page):
    """Extract one app's metadata from *page* and insert it into the database.

    One row is inserted per description block found on the page.  When any of
    the required fields is missing the app is reported and skipped rather than
    raising IndexError and aborting the whole crawl.
    """
    # Application name(s)
    names = _NAME_RE.findall(page)
    for title in names:
        print(title)

    # Developer: taken from a fixed line of the first itemprop="name" anchor.
    developer = None
    anchor_text = _first(_DEVELOPER_RE, page)
    if anchor_text is not None:
        anchor_lines = anchor_text.split("\n")
        if len(anchor_lines) > _DEVELOPER_LINE:
            developer = anchor_lines[_DEVELOPER_LINE]

    fields = (
        names[0] if names else None,     # name
        developer,                       # developer
        _first(_VERSION_RE, page),       # version
        _first(_PUBLISHED_RE, page),     # last update time
        _first(_FILE_SIZE_RE, page),     # file size
        _first(_OS_RE, page),            # supported OS / firmware
    )
    if any(field is None for field in fields):
        print("skipping app page: required metadata not found")
        return
    for field in fields[1:]:
        print(field)

    # Description(s)
    for description in _DESCRIPTION_RE.findall(page):
        values = fields + (category_name, _BREAK_TAG_RE.sub(' ', description))
        print(_INSERT_SQL % values)
        print(category_name)
        try:
            cursor.execute(_INSERT_SQL, values)
            conn.commit()
        except Exception as exc:
            # Best effort, as before: a failed insert must not stop the crawl,
            # but it is now reported instead of being silently discarded.
            print(exc)


def _save_covers(page, counter):
    """Download every cover image referenced in *page*.

    Each image is written to ``_IMAGE_DIR/<counter>``; *counter* is advanced
    once per saved image and the new value is returned.
    """
    for raw_src in _COVER_RE.findall(page):
        print(raw_src)
        # The capture is the raw attribute text including its quotes.  Strip
        # whitespace and quotes explicitly; the original slice [1:-2] also cut
        # the last character of the URL whenever the capture was exactly the
        # quoted value.
        src = raw_src.strip().strip('"\'')
        try:
            image = requests.get(src, timeout=_HTTP_TIMEOUT)
        except Exception as exc:
            print(exc)
            time.sleep(_RETRY_DELAY)
            # Nothing was downloaded, so there is nothing to write.
            continue
        # Image data is binary: write it in "wb" mode and close the handle.
        with open(os.path.join(_IMAGE_DIR, str(counter)), "wb") as out:
            out.write(image.content)
        counter += 1
        print(counter)
    return counter


def main():
    """Crawl every Play Store category, store app metadata and cover images.

    For each category in ``_CATEGORIES`` the category page is fetched, every
    linked app page is fetched in turn, its metadata is inserted into the
    ``googlemarket`` table and its cover image(s) are saved under
    ``_IMAGE_DIR``.  Pages that cannot be fetched are skipped.

    Exits the process with status 1 if the database connection cannot be
    established.  Updates the module-level ``num`` image counter.
    """
    global num
    try:
        # NOTE(review): credentials are hard-coded here; consider moving them
        # to configuration.
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                               db='addressbookdb', charset="utf8")
        conn.query("set names utf8")
    except Exception as e:
        print(e)
        # Non-zero status so callers can tell the run failed.
        sys.exit(1)
    cursor = conn.cursor()

    if not os.path.isdir(_IMAGE_DIR):
        os.makedirs(_IMAGE_DIR)

    try:
        for index, category_name in enumerate(_CATEGORIES):
            print(index)
            listing = _fetch_html(
                _BASE_URL + "/store/apps/category/" + category_name)
            if listing is None:
                continue
            for href in _APP_LINK_RE.findall(listing):
                page = _fetch_html(_BASE_URL + href)
                if page is None:
                    continue
                _store_app(conn, cursor, category_name, page)
                num = _save_covers(page, num)
    finally:
        # Release database resources however the crawl ends.
        cursor.close()
        conn.close()
# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()