python crawler0729.py

本文介绍了一种使用Python爬取Google Play商店中各类别应用信息的方法,包括应用名称、制造商、版本等,并将数据存入MySQL数据库。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >




#!/usr/bin/env python
#-*- coding: utf-8  -*-
import requests
import os,sys 
import MySQLdb
import time
from BeautifulSoup import BeautifulSoup
import re
# Running counter; main() declares it global, uses its value as the file name
# of each saved cover image under "googlemarket/", and increments it per app.
num = 0
# Module-level list; main() never declares it global, so this one stays empty.
dataresult = list()
# Patterns are compiled once here instead of on every loop iteration.
_APP_LINK_RE = re.compile(r'<a class="title" href="(.+?)" title')
_NAME_RE = re.compile(r'<div class="document-title" itemprop="name">[\s\S]*?<div>([\s\S]*?)</div>')
_DEVELOPER_RE = re.compile(r'itemprop="name">([\s\S]*?)</a>')
_VERSION_RE = re.compile(r'itemprop="softwareVersion">([\s\S]*?)</div>')
_PUBLISHED_RE = re.compile(r'itemprop="datePublished">([\s\S]*?)</div>')
_FILE_SIZE_RE = re.compile(r'itemprop="fileSize">([\s\S]*?)</div>')
_OS_RE = re.compile(r'itemprop="operatingSystems">([\s\S]*?)</div>')
_DESCRIPTION_RE = re.compile(r'itemprop="description">[\s\S]*?<div>([\s\S]*?)</div>')
_COVER_RE = re.compile(r'<img class="cover-image" src=(.+?) alt="Cover art" itemprop="image" />')
# Matches whole <br>, <br />, <p> and </p> tags.  The previous pattern was a
# character class, which blanked every 'b', 'r', 'p', '/', '<' and '>' found
# anywhere in the description text.
_DESCRIPTION_TAG_RE = re.compile(r'<br\s*/?>|</?p>')

_INSERT_SQL = "insert into googlemarket(name,developer,version,pubtime,filesize,support,classify,introduction) values(%s,%s,%s,%s,%s,%s,%s,%s)"

# Seconds to wait for a response, and to pause after a failed request.
_REQUEST_TIMEOUT = 30
_RETRY_PAUSE = 30


def _fetch_html(url):
    """Download *url* and return its prettified UTF-8 HTML, or None on failure.

    Errors raised while fetching or parsing are printed and followed by a
    pause, so the caller can skip the page instead of parsing stale data.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        return BeautifulSoup(response.content).prettify("utf-8")
    except Exception as e:
        print(e)
        time.sleep(_RETRY_PAUSE)
        return None


def _first_match(pattern, text):
    """Return the first capture of *pattern* in *text*, or '' when absent."""
    found = pattern.search(text)
    return found.group(1) if found else ''


def _store_app(conn, cursor, category_name, page):
    """Print the fields scraped from an app *page* and insert them into MySQL.

    One row is inserted per description block found on the page.  Fields that
    are missing from the page are stored as empty strings.
    """
    # App name
    names = _NAME_RE.findall(page)
    for title in names:
        print(title)
    name = names[0] if names else ''

    # Developer.  The original code takes line index 8 of the first
    # itemprop="name" anchor after prettify() has split it across lines.
    # NOTE(review): presumably that is where the developer name lands;
    # verify against the current page markup.
    developer_lines = _first_match(_DEVELOPER_RE, page).split("\n")
    developer = developer_lines[8] if len(developer_lines) > 8 else ''
    print(developer)

    # Version
    version = _first_match(_VERSION_RE, page)
    print(version)
    # Last update date
    published = _first_match(_PUBLISHED_RE, page)
    print(published)
    # File size
    file_size = _first_match(_FILE_SIZE_RE, page)
    print(file_size)
    # Supported operating systems
    supported_os = _first_match(_OS_RE, page)
    print(supported_os)

    # Description
    for description in _DESCRIPTION_RE.findall(page):
        values = (name, developer, version, published, file_size, supported_os,
                  category_name, _DESCRIPTION_TAG_RE.sub(' ', description))
        print(_INSERT_SQL % values)
        print(category_name)
        try:
            cursor.execute(_INSERT_SQL, values)
            conn.commit()
        except Exception as e:
            # Report the failed insert and keep crawling instead of hiding it.
            print(e)
            conn.rollback()


def _download_cover(page):
    """Return the bytes of the last cover image on *page* that downloads, or None."""
    content = None
    for src in _COVER_RE.findall(page):
        print(src)
        try:
            # The slice trims the captured src attribute exactly as the
            # original code did.  NOTE(review): confirm [1:-2] still matches
            # the quoting produced by prettify().
            content = requests.get(src[1:-2], timeout=_REQUEST_TIMEOUT).content
        except Exception as e:
            print(e)
            time.sleep(_RETRY_PAUSE)
    return content


def main():
    """Crawl Google Play category pages and store app details in MySQL.

    For every category the listing page is fetched, each linked app page is
    scraped for name, developer, version, update date, file size, supported
    OS and description, and the result is inserted into the ``googlemarket``
    table.  The cover image of each app is saved to ``googlemarket/<num>``,
    where ``num`` is the module-level counter incremented once per app page.

    Exits the process if the database connection cannot be opened.
    """
    global num
    # NOTE(review): database credentials are hard-coded here.
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456', db='addressbookdb', charset="utf8")
        conn.query("set names utf8")
    except Exception as e:
        print(e)
        sys.exit()
    cursor = conn.cursor()
    category = [
        'PERSONALIZATION', 'TRANSPORTATION', 'SPORTS', 'HEALTH_AND_FITNESS',
        'APP_WALLPAPER', 'COMICS', 'MEDICAL', 'BUSINESS',
        'BOOKS_AND_REFERENCE', 'WEATHER', 'ENTERTAINMENT', 'MEDIA_AND_VIDEO',
        'APP_WIDGETS', 'TOOLS', 'PHOTOGRAPHY', 'PRODUCTIVITY', 'EDUCATION',
        'NEWS_AND_MAGAZINES', 'TRAVEL_AND_LOCAL', 'LIFESTYLE', 'SOCIAL',
        'FINANCE', 'SHOPPING', 'LIBRARIES_AND_DEMO', 'COMMUNICATION',
        'MUSIC_AND_AUDIO', 'GAME',
    ]
    # The image directory must exist before the first cover is written.
    if not os.path.isdir("googlemarket"):
        os.makedirs("googlemarket")
    try:
        for k, category_name in enumerate(category):
            print(k)
            listing = _fetch_html("https://play.google.com/store/apps/category/" + category_name)
            if listing is None:
                # Skip this category rather than parse a stale/undefined page.
                continue
            for link in _APP_LINK_RE.findall(listing):
                page = _fetch_html("https://play.google.com" + link)
                if page is None:
                    continue
                _store_app(conn, cursor, category_name, page)
                cover = _download_cover(page)
                if cover is not None:
                    # Binary mode: the payload is image bytes, not text.
                    with open("googlemarket/" + str(num), "wb") as f:
                        f.write(cover)
                # Incremented for every app page, whether or not a cover was saved.
                num = num + 1
                print(num)
    finally:
        cursor.close()
        conn.close()
		
	
	

# Start the crawl only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值