Lab Requirements
Objective
Crawl http://www.instagram.com and collect posts on the topic "network security".
Requirements
- Data to collect: user bio, post count, following count, follower count, posted image files, post time, like count, comment count;
- Basic criteria:
(1) Support collecting information posted by publicly accessible Instagram accounts (users); the number of users collected must be no fewer than 100;
(2) The collected data must be stored in a database system.
Code
isn.py
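The script below fetches the hashtag page as JSON via Instagram's ?__a=1 query parameter, walks the returned GraphQL payload for the tag's top posts, and requests each post's own JSON page to read the owner's username and engagement numbers. It prepares a MySQL table for storage and CSV/text output files under ./Save/.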
# -*- coding: utf-8 -*-
import requests
import json
import os
import urllib.parse
import re
import time
import csv
import codecs
import pymysql
s = requests.session()
s.headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
}
# Retry failed connections up to 5 times. Note: assigning
# requests.adapters.DEFAULT_RETRIES after import has no effect, because the
# default is bound when HTTPAdapter is defined; mount adapters instead.
s.mount('http://', requests.adapters.HTTPAdapter(max_retries=5))
s.mount('https://', requests.adapters.HTTPAdapter(max_retries=5))
search = "networksecurity"  # the hashtag to search for
website = "http://www.instagram.com"
q = urllib.parse.quote(search)
print(q)
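# Appending "?__a=1" makes Instagram return the page data as JSON instead of HTML.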
url1 = website+"/explore/tags/"+q+"/?__a=1"
html = s.get(url1)
ans = json.loads(html.text)
pgn = 0
########################################
# Set up the MySQL database
db = pymysql.connect(host="localhost", port=3306, user="your_mysql_user", password="your_mysql_password", db="Instagram", charset="utf8")
cursor = db.cursor()
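# Recreate the Info table from scratch on every run; every column is kept as
# VARCHAR for simplicity (the numeric counts could use INT instead).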
cursor.execute("DROP TABLE IF EXISTS Info")
createTable = """CREATE TABLE Info(
    Username VARCHAR(50) NOT NULL,
    PostsNumber VARCHAR(8) NOT NULL,
    FansNumber VARCHAR(8) NOT NULL,
    AttentionNumber VARCHAR(8) NOT NULL,
    Ptime VARCHAR(20) NOT NULL,
    PicURL VARCHAR(500) NOT NULL,
    CommentNumber VARCHAR(8) NOT NULL,
    LikeNumber VARCHAR(8) NOT NULL) ENGINE=MyISAM DEFAULT CHARSET=utf8"""
cursor.execute(createTable)
os.makedirs("./Save", exist_ok=True)  # make sure the output directory exists
f = open("./Save/" + str(search) + ".txt", "w", encoding='utf-8')
# gb18030 keeps the CSV readable in Excel on Chinese-locale systems.
csvfile = codecs.open("./Save/" + str(search) + ".csv", 'wb', encoding='gb18030')
result = []
########################################
writer = csv.writer(csvfile)
data = ['Username', 'Bio', 'Posts', 'Followers', 'Following', 'Post time', 'Image URL', 'Caption', 'Comments', 'Likes']
writer.writerow(data)
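# The hashtag page's GraphQL payload lists its top posts under edge_hashtag_to_top_posts.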
edges = ans['graphql']['hashtag']['edge_hashtag_to_top_posts']['edges']
n = 0
for i in range(len(edges)):
    temp_dict = {}
    # Skip posts that carry no caption text.
    if len(edges[i]['node']['edge_media_to_caption']['edges']) == 0:
        continue
    d = edges[i]['node']['edge_media_to_caption']['edges'][0]['node']['text']
    shortcode = edges[i]['node']['shortcode']
    # Each post's own "?__a=1" page exposes the owner and engagement data.
    url2 = website + "/p/" + shortcode + "/?__a=1"
    getnt = s.get(url2, verify=False)  # verify=False skips TLS certificate checks
    getnt = json.loads(getnt.text)
    username = getnt['graphql']['shortcode_media']['owner']['username']
    ptime = getnt['graphql']