下面是我优达学城项目三的记录报告
里面的思路和文字说明大多都在代码块里面的注释中,#后面?,可能不太容易看,需要认真看。?
#导入可能需要的包
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
import requests
from pprint import pprint
import re
1. 收集
# 通过编程方式获得文件,1)image-predictions.tsv
url = "https://raw.githubusercontent.com/udacity/new-dand-advanced-china/master/%E6%95%B0%E6%8D%AE%E6%B8%85%E6%B4%97/WeRateDogs%E9%A1%B9%E7%9B%AE/image-predictions.tsv"
response = requests.get(url)
with open (os.path.join(url.split("/")[-1]),mode="wb" ) as file:
file.write(response.content) #把文本文件写入文件中去。存下来
print("下载完毕!")
#!ls
下载完毕!
# 通过编程方式下载文件,2)twitter-archive-enhanced.csv 下载完成
url2 = "https://raw.githubusercontent.com/udacity/new-dand-advanced-china/master/%E6%95%B0%E6%8D%AE%E6%B8%85%E6%B4%97/WeRateDogs%E9%A1%B9%E7%9B%AE/twitter-archive-enhanced.csv"
# url = "https://raw.githubusercontent.com/udacity/new-dand-advanced-china/master/%E6%95%B0%E6%8D%AE%E6%B8%85%E6%B4%97/WeRateDogs%E9%A1%B9%E7%9B%AE/image-predictions.tsv"
response2 = requests.get(url2)
with open (os.path.join(url2.split("/")[-1]),mode="wb" ) as file:
file.write(response2.content) #把文本文件写入文件中去。存下来
print("下载完毕!")
#!ls
下载完毕!
# 通过编程方式下载文件,3)twitter-archive-enhanced.csv 下载完成
url3 = "https://raw.githubusercontent.com/udacity/new-dand-advanced-china/master/%E6%95%B0%E6%8D%AE%E6%B8%85%E6%B4%97/WeRateDogs%E9%A1%B9%E7%9B%AE/tweet_json.txt"
response3 = requests.get(url3)
with open (os.path.join(url3.split("/")[-1]),mode="wb" ) as file:
file.write(response3.content) #把文本文件写入文件中去。存下来
print("下载完毕!")
#!ls #检查文件是否有下载到路径中去。
下载完毕!
2.评估
陆续读取打开各个文件以查看数据情况
#先读取
#读取出这几个文件
twitter_achieve = pd.read_csv("twitter-archive-enhanced.csv")
image_predictions = pd.read_csv("image-predictions.tsv",sep="\t")
twitter_achieve[twitter_achieve['expanded_urls'].isnull()]
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
30 | 886267009285017600 | 8.862664e+17 | 2.281182e+09 | 2017-07-15 16:51:35 +0000 | <a href="http://twitter.com/download/iphone" r... | @NonWhiteHat @MayhewMayhem omg hello tanner yo... | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | None | None |
55 | 881633300179243008 | 8.816070e+17 | 4.738443e+07 | 2017-07-02 21:58:53 +0000 | <a href="http://twitter.com/download/iphone" r... | @roushfenway These are good dogs but 17/10 is ... | NaN | NaN | NaN | NaN | 17 | 10 | None | None | None | None | None |
64 | 879674319642796034 | 8.795538e+17 | 3.105441e+09 | 2017-06-27 12:14:36 +0000 | <a href="http://twitter.com/download/iphone" r... | @RealKentMurphy 14/10 confirmed | NaN | NaN | NaN | NaN | 14 | 10 | None | None | None | None | None |
113 | 870726314365509632 | 8.707262e+17 | 1.648776e+07 | 2017-06-02 19:38:25 +0000 | <a href="http://twitter.com/download/iphone" r... | @ComplicitOwl @ShopWeRateDogs >10/10 is res... | NaN | NaN | NaN | NaN | 10 | 10 | None | None | None | None | None |
148 | 863427515083354112 | 8.634256e+17 | 7.759620e+07 | 2017-05-13 16:15:35 +0000 | <a href="http://twitter.com/download/iphone" r... | @Jack_Septic_Eye I'd need a few more pics to p... | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | None | None |
179 | 857214891891077121 | 8.571567e+17 | 1.806710e+08 | 2017-04-26 12:48:51 +0000 | <a href="http://twitter.com/download/iphone" r... | @Marc_IRL pixelated af 12/10 | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | None | None |
185 | 856330835276025856 | NaN | NaN | 2017-04-24 02:15:55 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @Jenna_Marbles: @dog_rates Thanks for ratin... | 8.563302e+17 | 66699013.0 | 2017-04-24 02:13:14 +0000 | NaN | 14 | 10 | None | None | None | None | None |
186 | 856288084350160898 | 8.562860e+17 | 2.792810e+08 | 2017-04-23 23:26:03 +0000 | <a href="http://twitter.com/download/iphone" r... | @xianmcguire @Jenna_Marbles Kardashians wouldn... | NaN | NaN | NaN | NaN | 14 | 10 | None | None | None | None | None |
188 | 855862651834028034 | 8.558616e+17 | 1.943518e+08 | 2017-04-22 19:15:32 +0000 | <a href="http://twitter.com/download/iphone" r... | @dhmontgomery We also gave snoop dogg a 420/10... | NaN | NaN | NaN | NaN | 420 | 10 | None | None | None | None | None |
189 | 855860136149123072 | 8.558585e+17 | 1.361572e+07 | 2017-04-22 19:05:32 +0000 | <a href="http://twitter.com/download/iphone" r... | @s8n You tried very hard to portray this good ... | NaN | NaN | NaN | NaN | 666 | 10 | None | None | None | None | None |
218 | 850333567704068097 | 8.503288e+17 | 2.195506e+07 | 2017-04-07 13:04:55 +0000 | <a href="http://twitter.com/download/iphone" r... | @markhoppus MARK THAT DOG HAS SEEN AND EXPERIE... | NaN | NaN | NaN | NaN | 13 | 10 | None | None | None | None | None |
228 | 848213670039564288 | 8.482121e+17 | 4.196984e+09 | 2017-04-01 16:41:12 +0000 | <a href="http://twitter.com/download/iphone" r... | Jerry just apuppologized to me. He said there ... | NaN | NaN | NaN | NaN | 11 | 10 | None | None | None | None | None |
234 | 847617282490613760 | 8.476062e+17 | 4.196984e+09 | 2017-03-31 01:11:22 +0000 | <a href="http://twitter.com/download/iphone" r... | .@breaannanicolee PUPDATE: Cannon has a heart ... | NaN | NaN | NaN | NaN | 13 | 10 | None | None | None | None | None |
274 | 840698636975636481 | 8.406983e+17 | 8.405479e+17 | 2017-03-11 22:59:09 +0000 | <a href="http://twitter.com/download/iphone" r... | @0_kelvin_0 >10/10 is reserved for puppos s... | NaN | NaN | NaN | NaN | 10 | 10 | None | None | None | None | None |
290 | 838150277551247360 | 8.381455e+17 | 2.195506e+07 | 2017-03-04 22:12:52 +0000 | <a href="http://twitter.com/download/iphone" r... | @markhoppus 182/10 | NaN | NaN | NaN | NaN | 182 | 10 | None | None | None | None | None |
291 | 838085839343206401 | 8.380855e+17 | 2.894131e+09 | 2017-03-04 17:56:49 +0000 | <a href="http://twitter.com/download/iphone" r... | @bragg6of8 @Andy_Pace_ we are still looking fo... | NaN | NaN | NaN | NaN | 15 | 10 | None | None | None | None | None |
313 | 835246439529840640 | 8.352460e+17 | 2.625958e+07 | 2017-02-24 21:54:03 +0000 | <a href="http://twitter.com/download/iphone" r... | @jonnysun @Lin_Manuel ok jomny I know you're e... | NaN | NaN | NaN | NaN | 960 | 0 | None | None | None | None | None |
342 | 832088576586297345 | 8.320875e+17 | 3.058208e+07 | 2017-02-16 04:45:50 +0000 | <a href="http://twitter.com/download/iphone" r... | @docmisterio account started on 11/15/15 | NaN | NaN | NaN | NaN | 11 | 15 | None | None | None | None | None |
346 | 831926988323639298 | 8.319030e+17 | 2.068372e+07 | 2017-02-15 18:03:45 +0000 | <a href="http://twitter.com/download/iphone" r... | @UNC can confirm 12/10 | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | None | None |
375 | 828361771580813312 | NaN | NaN | 2017-02-05 21:56:51 +0000 | <a href="http://twitter.com" rel="nofollow">Tw... | Beebop and Doobert should start a band 12/10 w... | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | None | None |
387 | 826598799820865537 | 8.265984e+17 | 4.196984e+09 | 2017-02-01 01:11:25 +0000 | <a href="http://twitter.com/download/iphone" r... | I was going to do 007/10, but the joke wasn't ... | NaN | NaN | NaN | NaN | 7 | 10 | None | None | None | None | None |
409 | 823333489516937216 | 8.233264e+17 | 1.582854e+09 | 2017-01-23 00:56:15 +0000 | <a href="http://twitter.com/download/iphone" r... | @HistoryInPics 13/10 | NaN | NaN | NaN | NaN | 13 | 10 | None | None | None | None | None |
427 | 821153421864615936 | 8.211526e+17 | 1.132119e+08 | 2017-01-17 00:33:26 +0000 | <a href="http://twitter.com/download/iphone" r... | @imgur for a polar bear tho I'd say 13/10 is a... | NaN | NaN | NaN | NaN | 13 | 10 | None | None | None | None | None |
498 | 813130366689148928 | 8.131273e+17 | 4.196984e+09 | 2016-12-25 21:12:41 +0000 | <a href="http://twitter.com/download/iphone" r... | I've been informed by multiple sources that th... | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | None | None |
513 | 811647686436880384 | 8.116272e+17 | 4.196984e+09 | 2016-12-21 19:01:02 +0000 | <a href="http://twitter.com/download/iphone" r... | PUPDATE: I've been informed that Augie was act... | NaN | NaN | NaN | NaN | 11 | 10 | None | None | None | None | None |
570 | 801854953262350336 | 8.018543e+17 | 1.185634e+07 | 2016-11-24 18:28:13 +0000 | <a href="http://twitter.com/download/iphone" r... | .@NBCSports OMG THE TINY HAT I'M GOING TO HAVE... | NaN | NaN | NaN | NaN | 11 | 10 | None | None | None | None | None |
576 | 800859414831898624 | 8.008580e+17 | 2.918590e+08 | 2016-11-22 00:32:18 +0000 | <a href="http://twitter.com/download/iphone" r... | @SkyWilliams doggo simply protecting you from ... | NaN | NaN | NaN | NaN | 11 | 10 | None | doggo | None | None | None |
611 | 797165961484890113 | 7.971238e+17 | 2.916630e+07 | 2016-11-11 19:55:50 +0000 | <a href="http://twitter.com/download/iphone" r... | @JODYHiGHROLLER it may be an 11/10 but what do... | NaN | NaN | NaN | NaN | 11 | 10 | None | None | None | None | None |
701 | 786051337297522688 | 7.727430e+17 | 7.305050e+17 | 2016-10-12 03:50:17 +0000 | <a href="http://twitter.com/download/iphone" r... | 13/10 for breakdancing puppo @shibbnbot | NaN | NaN | NaN | NaN | 13 | 10 | None | None | None | None | puppo |
707 | 785515384317313025 | NaN | NaN | 2016-10-10 16:20:36 +0000 | <a href="http://twitter.com/download/iphone" r... | Today, 10/10, should be National Dog Rates Day | NaN | NaN | NaN | NaN | 10 | 10 | None | None | None | None | None |
843 | 766714921925144576 | 7.667118e+17 | 4.196984e+09 | 2016-08-19 19:14:16 +0000 | <a href="http://twitter.com/download/iphone" r... | His name is Charley and he already has a new s... | NaN | NaN | NaN | NaN | 13 | 10 | None | None | None | None | None |
857 | 763956972077010945 | 7.638652e+17 | 1.584641e+07 | 2016-08-12 04:35:10 +0000 | <a href="http://twitter.com/download/iphone" r... | @TheEllenShow I'm not sure if you know this bu... | NaN | NaN | NaN | NaN | 12 | 10 | None | doggo | None | None | None |
967 | 750381685133418496 | 7.501805e+17 | 4.717297e+09 | 2016-07-05 17:31:49 +0000 | <a href="http://twitter.com/download/iphone" r... | 13/10 such a good doggo\n@spaghemily | NaN | NaN | NaN | NaN | 13 | 10 | None | doggo | None | None | None |
1005 | 747651430853525504 | 7.476487e+17 | 4.196984e+09 | 2016-06-28 04:42:46 +0000 | <a href="http://twitter.com/download/iphone" r... | Other pupper asked not to have his identity sh... | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | pupper | None |
1080 | 738891149612572673 | 7.384119e+17 | 3.589728e+08 | 2016-06-04 00:32:32 +0000 | <a href="http://twitter.com/download/iphone" r... | @mount_alex3 13/10 | NaN | NaN | NaN | NaN | 13 | 10 | None | None | None | None | None |
1295 | 707983188426153984 | 7.079801e+17 | 2.319108e+09 | 2016-03-10 17:35:20 +0000 | <a href="http://twitter.com/download/iphone" r... | @serial @MrRoles OH MY GOD I listened to all o... | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | None | None |
1345 | 704491224099647488 | 7.044857e+17 | 2.878549e+07 | 2016-03-01 02:19:31 +0000 | <a href="http://twitter.com/download/iphone" r... | 13/10 hero af\n@ABC | NaN | NaN | NaN | NaN | 13 | 10 | None | None | None | None | None |
1445 | 696518437233913856 | NaN | NaN | 2016-02-08 02:18:30 +0000 | <a href="http://twitter.com/download/iphone" r... | Oh my god 10/10 for every little hot dog pupper | NaN | NaN | NaN | NaN | 10 | 10 | None | None | None | pupper | None |
1446 | 696490539101908992 | 6.964887e+17 | 4.196984e+09 | 2016-02-08 00:27:39 +0000 | <a href="http://twitter.com/download/iphone" r... | After reading the comments I may have overesti... | NaN | NaN | NaN | NaN | 1 | 10 | None | None | None | None | None |
1474 | 693644216740769793 | 6.936422e+17 | 4.196984e+09 | 2016-01-31 03:57:23 +0000 | <a href="http://twitter.com/download/iphone" r... | BREAKING PUPDATE: I've just been notified that... | NaN | NaN | NaN | NaN | 10 | 10 | None | None | None | None | None |
1479 | 693582294167244802 | 6.935722e+17 | 1.198989e+09 | 2016-01-30 23:51:19 +0000 | <a href="http://twitter.com/download/iphone" r... | Personally I'd give him an 11/10. Not sure why... | NaN | NaN | NaN | NaN | 11 | 10 | None | None | None | None | None |
1497 | 692423280028966913 | 6.924173e+17 | 4.196984e+09 | 2016-01-27 19:05:49 +0000 | <a href="http://twitter.com/download/iphone" r... | PUPDATE: just noticed this dog has some extra ... | NaN | NaN | NaN | NaN | 9 | 10 | None | None | None | None | None |
1523 | 690607260360429569 | 6.903413e+17 | 4.670367e+08 | 2016-01-22 18:49:36 +0000 | <a href="http://twitter.com/download/iphone" r... | 12/10 @LightningHoltt | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | None | None |
1598 | 686035780142297088 | 6.860340e+17 | 4.196984e+09 | 2016-01-10 04:04:10 +0000 | <a href="http://twitter.com/download/iphone" r... | Yes I do realize a rating of 4/20 would've bee... | NaN | NaN | NaN | NaN | 4 | 20 | None | None | None | None | None |
1605 | 685681090388975616 | 6.855479e+17 | 4.196984e+09 | 2016-01-09 04:34:45 +0000 | <a href="http://twitter.com/download/iphone" r... | Jack deserves another round of applause. If yo... | NaN | NaN | NaN | NaN | 14 | 10 | None | None | None | None | None |
1618 | 684969860808454144 | 6.849598e+17 | 4.196984e+09 | 2016-01-07 05:28:35 +0000 | <a href="http://twitter.com/download/iphone" r... | For those who claim this is a goat, u are wron... | NaN | NaN | NaN | NaN | 5 | 10 | None | None | None | None | None |
1663 | 682808988178739200 | 6.827884e+17 | 4.196984e+09 | 2016-01-01 06:22:03 +0000 | <a href="http://twitter.com/download/iphone" r... | I'm aware that I could've said 20/16, but here... | NaN | NaN | NaN | NaN | 20 | 16 | None | None | None | None | None |
1689 | 681340665377193984 | 6.813394e+17 | 4.196984e+09 | 2015-12-28 05:07:27 +0000 | <a href="http://twitter.com/download/iphone" r... | I've been told there's a slight possibility he... | NaN | NaN | NaN | NaN | 5 | 10 | None | None | None | None | None |
1774 | 678023323247357953 | 6.780211e+17 | 4.196984e+09 | 2015-12-19 01:25:31 +0000 | <a href="http://twitter.com/download/iphone" r... | After getting lost in Reese's eyes for several... | NaN | NaN | NaN | NaN | 13 | 10 | None | None | None | None | None |
1819 | 676590572941893632 | 6.765883e+17 | 4.196984e+09 | 2015-12-15 02:32:17 +0000 | <a href="http://twitter.com/download/iphone" r... | After some outrage from the crowd. Bubbles is ... | NaN | NaN | NaN | NaN | 7 | 10 | None | None | None | None | None |
1844 | 675849018447167488 | 6.758457e+17 | 4.196984e+09 | 2015-12-13 01:25:37 +0000 | <a href="http://twitter.com/download/iphone" r... | This dog is being demoted to a 9/10 for not we... | NaN | NaN | NaN | NaN | 9 | 10 | None | None | None | None | None |
1895 | 674742531037511680 | 6.747400e+17 | 4.196984e+09 | 2015-12-10 00:08:50 +0000 | <a href="http://twitter.com/download/iphone" r... | Some clarification is required. The dog is sin... | NaN | NaN | NaN | NaN | 11 | 10 | None | None | None | None | None |
1905 | 674606911342424069 | 6.744689e+17 | 4.196984e+09 | 2015-12-09 15:09:55 +0000 | <a href="http://twitter.com/download/iphone" r... | The 13/10 also takes into account this impecca... | NaN | NaN | NaN | NaN | 13 | 10 | None | None | None | None | None |
1914 | 674330906434379776 | 6.658147e+17 | 1.637468e+07 | 2015-12-08 20:53:11 +0000 | <a href="http://twitter.com/download/iphone" r... | 13/10\n@ABC7 | NaN | NaN | NaN | NaN | 13 | 10 | None | None | None | None | None |
1940 | 673716320723169284 | 6.737159e+17 | 4.196984e+09 | 2015-12-07 04:11:02 +0000 | <a href="http://twitter.com/download/iphone" r... | The millennials have spoken and we've decided ... | NaN | NaN | NaN | NaN | 1 | 10 | None | None | None | None | None |
2038 | 671550332464455680 | 6.715449e+17 | 4.196984e+09 | 2015-12-01 04:44:10 +0000 | <a href="http://twitter.com/download/iphone" r... | After 22 minutes of careful deliberation this ... | NaN | NaN | NaN | NaN | 1 | 10 | None | None | None | None | None |
2149 | 669684865554620416 | 6.693544e+17 | 4.196984e+09 | 2015-11-26 01:11:28 +0000 | <a href="http://twitter.com/download/iphone" r... | After countless hours of research and hundreds... | NaN | NaN | NaN | NaN | 11 | 10 | None | None | None | None | None |
2189 | 668967877119254528 | 6.689207e+17 | 2.143566e+07 | 2015-11-24 01:42:25 +0000 | <a href="http://twitter.com/download/iphone" r... | 12/10 good shit Bubka\n@wane15 | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | None | None |
2298 | 667070482143944705 | 6.670655e+17 | 4.196984e+09 | 2015-11-18 20:02:51 +0000 | <a href="http://twitter.com/download/iphone" r... | After much debate this dog is being upgraded t... | NaN | NaN | NaN | NaN | 10 | 10 | None | None | None | None | None |
twitter_achieve.head(3)
#大概查看数据集情况,发现质量问题
# 1)质量问题:source 里面包含html的标签,可以进一步提取
# 2)整洁性问题:狗的地位stage(体型而定的) 应该为分类数据,应该放在同一列中
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Phineas. He's a mystical boy. Only eve... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643... | 13 | 10 | Phineas | None | None | None | None |
1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Tilly. She's just checking pup on you.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421... | 13 | 10 | Tilly | None | None | None | None |
2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Archie. He is a rare Norwegian Pouncin... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181... | 12 | 10 | Archie | None | None | None | None |
twitter_achieve.name.value_counts()
#3)质量问题:狗的名字可以相同,但是 有异常的 a,None the an
None 745
a 55
Charlie 12
Oliver 11
Lucy 11
Cooper 11
Lola 10
Tucker 10
Penny 10
Bo 9
Winston 9
Sadie 8
the 8
Buddy 7
Daisy 7
Toby 7
Bailey 7
an 7
Scout 6
Jack 6
Dave 6
Oscar 6
Koda 6
Stanley 6
Leo 6
Milo 6
Rusty 6
Jax 6
Bella 6
Finn 5
...
Gustav 1
Andy 1
Pippin 1
Molly 1
Sage 1
Ashleigh 1
Schnozz 1
Shiloh 1
Margo 1
Tito 1
Brownie 1
my 1
Pherb 1
Colin 1
Buckley 1
Alexander 1
Kulet 1
Trigger 1
Aja 1
Petrick 1
Izzy 1
Milky 1
Dido 1
Kara 1
Wiggles 1
Carter 1
JD 1
by 1
Boston 1
Jarod 1
Name: name, Length: 957, dtype: int64
twitter_achieve.retweeted_status_user_id.notnull().value_counts()
# 4)retweeted_status_user_id 不为空的是 转发的推文 ,需要清理掉,只留下非转发的推文
False 2175
True 181
Name: retweeted_status_user_id, dtype: int64
twitter_achieve.info()
# 5)质量问题: 发现in_reply_to_status_id in_reply_to_user_id 都只有78条,基本都是空值,需要清理
# 6) 质量问题: tweet_id 用户id应该为字符串类型,而不是int64 数值类型,所以需要修改。后面读取完三个表后同样发现都需要统一成字符串类型
# 7) 质量问题: expanded_urls是推文的链接地址,存在缺失值,缺失的行可能已经失效,需要处理
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id 2356 non-null int64
in_reply_to_status_id 78 non-null float64
in_reply_to_user_id 78 non-null float64
timestamp 2356 non-null object
source 2356 non-null object
text 2356 non-null object
retweeted_status_id 181 non-null float64
retweeted_status_user_id 181 non-null float64
retweeted_status_timestamp 181 non-null object
expanded_urls 2297 non-null object
rating_numerator 2356 non-null int64
rating_denominator 2356 non-null int64
name 2356 non-null object
doggo 2356 non-null object
floofer 2356 non-null object
pupper 2356 non-null object
puppo 2356 non-null object
dtypes: float64(4), int64(3), object(10)
memory usage: 313.0+ KB
twitter_achieve.expanded_urls
# 8)整洁性问题,expanded_urls里面有写链接有多条链接重复在一起,用逗号分隔的,如6这种。
0 https://twitter.com/dog_rates/status/892420643...
1 https://twitter.com/dog_rates/status/892177421...
2 https://twitter.com/dog_rates/status/891815181...
3 https://twitter.com/dog_rates/status/891689557...
4 https://twitter.com/dog_rates/status/891327558...
5 https://twitter.com/dog_rates/status/891087950...
6 https://gofundme.com/ydvmve-surgery-for-jax,ht...
7 https://twitter.com/dog_rates/status/890729181...
8 https://twitter.com/dog_rates/status/890609185...
9 https://twitter.com/dog_rates/status/890240255...
10 https://twitter.com/dog_rates/status/890006608...
11 https://twitter.com/dog_rates/status/889880896...
12 https://twitter.com/dog_rates/status/889665388...
13 https://twitter.com/dog_rates/status/889638837...
14 https://twitter.com/dog_rates/status/889531135...
15 https://twitter.com/dog_rates/status/889278841...
16 https://twitter.com/dog_rates/status/888917238...
17 https://twitter.com/dog_rates/status/888804989...
18 https://twitter.com/dog_rates/status/888554962...
19 https://twitter.com/dog_rates/status/887473957...
20 https://twitter.com/dog_rates/status/888078434...
21 https://twitter.com/dog_rates/status/887705289...
22 https://twitter.com/dog_rates/status/887517139...
23 https://twitter.com/dog_rates/status/887473957...
24 https://twitter.com/dog_rates/status/887343217...
25 https://twitter.com/dog_rates/status/887101392...
26 https://twitter.com/dog_rates/status/886983233...
27 https://www.gofundme.com/mingusneedsus,https:/...
28 https://twitter.com/dog_rates/status/886680336...
29 https://twitter.com/dog_rates/status/886366144...
...
2326 https://twitter.com/dog_rates/status/666411507...
2327 https://twitter.com/dog_rates/status/666407126...
2328 https://twitter.com/dog_rates/status/666396247...
2329 https://twitter.com/dog_rates/status/666373753...
2330 https://twitter.com/dog_rates/status/666362758...
2331 https://twitter.com/dog_rates/status/666353288...
2332 https://twitter.com/dog_rates/status/666345417...
2333 https://twitter.com/dog_rates/status/666337882...
2334 https://twitter.com/dog_rates/status/666293911...
2335 https://twitter.com/dog_rates/status/666287406...
2336 https://twitter.com/dog_rates/status/666273097...
2337 https://twitter.com/dog_rates/status/666268910...
2338 https://twitter.com/dog_rates/status/666104133...
2339 https://twitter.com/dog_rates/status/666102155...
2340 https://twitter.com/dog_rates/status/666099513...
2341 https://twitter.com/dog_rates/status/666094000...
2342 https://twitter.com/dog_rates/status/666082916...
2343 https://twitter.com/dog_rates/status/666073100...
2344 https://twitter.com/dog_rates/status/666071193...
2345 https://twitter.com/dog_rates/status/666063827...
2346 https://twitter.com/dog_rates/status/666058600...
2347 https://twitter.com/dog_rates/status/666057090...
2348 https://twitter.com/dog_rates/status/666055525...
2349 https://twitter.com/dog_rates/status/666051853...
2350 https://twitter.com/dog_rates/status/666050758...
2351 https://twitter.com/dog_rates/status/666049248...
2352 https://twitter.com/dog_rates/status/666044226...
2353 https://twitter.com/dog_rates/status/666033412...
2354 https://twitter.com/dog_rates/status/666029285...
2355 https://twitter.com/dog_rates/status/666020888...
Name: expanded_urls, Length: 2356, dtype: object
twitter_achieve.rating_denominator.value_counts()
# 9)质量问题:评分的分母除了10,还有少量不为10,如11,2,7,10的倍数等,需要重新检查或者重新提取
10 2333
11 3
50 3
80 2
20 2
2 1
16 1
40 1
70 1
15 1
90 1
110 1
120 1
130 1
150 1
170 1
7 1
0 1
Name: rating_denominator, dtype: int64
image_predictions.info() #评估image_predictions 表
image_predictions.head(4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id 2075 non-null int64
jpg_url 2075 non-null object
img_num 2075 non-null int64
p1 2075 non-null object
p1_conf 2075 non-null float64
p1_dog 2075 non-null bool
p2 2075 non-null object
p2_conf 2075 non-null float64
p2_dog 2075 non-null bool
p3 2075 non-null object
p3_conf 2075 non-null float64
p3_dog 2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB
tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh_springer_spaniel | 0.465074 | True | collie | 0.156665 | True | Shetland_sheepdog | 0.061428 | True |
1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | redbone | 0.506826 | True | miniature_pinscher | 0.074192 | True | Rhodesian_ridgeback | 0.072010 | True |
2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German_shepherd | 0.596461 | True | malinois | 0.138584 | True | bloodhound | 0.116197 | True |
3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian_ridgeback | 0.408143 | True | redbone | 0.360687 | True | miniature_pinscher | 0.222752 | True |
image_predictions[image_predictions['jpg_url'].duplicated()==True]['jpg_url'].value_counts()
# 10)image_predictions 有很多重复的图片链接有 66条之多,需要删除
https://pbs.twimg.com/media/CtzKC7zXEAALfSo.jpg 1
https://pbs.twimg.com/media/CvoBPWRWgAA4het.jpg 1
https://pbs.twimg.com/media/Co-hmcYXYAASkiG.jpg 1
https://pbs.twimg.com/media/CsrjryzWgAAZY00.jpg 1
https://pbs.twimg.com/media/Cp6db4-XYAAMmqL.jpg 1
https://pbs.twimg.com/media/CV_cnjHWUAADc-c.jpg 1
https://pbs.twimg.com/media/CdHwZd0VIAA4792.jpg 1
https://pbs.twimg.com/media/Crwxb5yWgAAX5P_.jpg 1
https://pbs.twimg.com/media/DFDw2tyUQAAAFke.jpg 1
https://pbs.twimg.com/media/CVgdFjNWEAAxmbq.jpg 1
https://pbs.twimg.com/media/CUN4Or5UAAAa5K4.jpg 1
https://pbs.twimg.com/media/CuRDF-XWcAIZSer.jpg 1
https://pbs.twimg.com/media/Cq9guJ5WgAADfpF.jpg 1
https://pbs.twimg.com/media/Ct2qO5PXEAE6eB0.jpg 1
https://pbs.twimg.com/media/CxqsX-8XUAAEvjD.jpg 1
https://pbs.twimg.com/ext_tw_video_thumb/817423809049493505/pu/img/5OFW0yueFu9oTUiQ.jpg 1
https://pbs.twimg.com/media/CWyD2HGUYAQ1Xa7.jpg 1
https://pbs.twimg.com/media/CYLDikFWEAAIy1y.jpg 1
https://pbs.twimg.com/media/CVuQ2LeUsAAIe3s.jpg 1
https://pbs.twimg.com/media/CvaYgDOWgAEfjls.jpg 1
https://pbs.twimg.com/media/C4bTH6nWMAAX_bJ.jpg 1
https://pbs.twimg.com/media/CW88XN4WsAAlo8r.jpg 1
https://pbs.twimg.com/media/CvJCabcWgAIoUxW.jpg 1
https://pbs.twimg.com/media/Ck2d7tJWUAEPTL3.jpg 1
https://pbs.twimg.com/media/CtKHLuCWYAA2TTs.jpg 1
https://pbs.twimg.com/media/Cwx99rpW8AMk_Ie.jpg 1
https://pbs.twimg.com/media/CvyVxQRWEAAdSZS.jpg 1
https://pbs.twimg.com/media/CvT6IV6WEAQhhV5.jpg 1
https://pbs.twimg.com/ext_tw_video_thumb/815965888126062592/pu/img/JleSw4wRhgKDWQj5.jpg 1
https://pbs.twimg.com/media/C3nygbBWQAAjwcW.jpg 1
..
https://pbs.twimg.com/media/CiyHLocU4AI2pJu.jpg 1
https://pbs.twimg.com/media/Ct72q9jWcAAhlnw.jpg 1
https://pbs.twimg.com/media/CkNjahBXAAQ2kWo.jpg 1
https://pbs.twimg.com/media/CwJR1okWIAA6XMp.jpg 1
https://pbs.twimg.com/tweet_video_thumb/CeBym7oXEAEWbEg.jpg 1
https://pbs.twimg.com/media/CsVO7ljW8AAckRD.jpg 1
https://pbs.twimg.com/ext_tw_video_thumb/807106774843039744/pu/img/8XZg1xW35Xp2J6JW.jpg 1
https://pbs.twimg.com/media/CU1zsMSUAAAS0qW.jpg 1
https://pbs.twimg.com/media/Cbs3DOAXIAAp3Bd.jpg 1
https://pbs.twimg.com/media/Cveg1-NXgAASaaT.jpg 1
https://pbs.twimg.com/media/ChK1tdBWwAQ1flD.jpg 1
https://pbs.twimg.com/media/CwiuEJmW8AAZnit.jpg 1
https://pbs.twimg.com/media/CU3mITUWIAAfyQS.jpg 1
https://pbs.twimg.com/media/CZhn-QAWwAASQan.jpg 1
https://pbs.twimg.com/media/CmoPdmHW8AAi8BI.jpg 1
https://pbs.twimg.com/media/Cx5R8wPVEAALa9r.jpg 1
https://pbs.twimg.com/media/CpmyNumW8AAAJGj.jpg 1
https://pbs.twimg.com/media/Cs_DYr1XEAA54Pu.jpg 1
https://pbs.twimg.com/media/C12whDoVEAALRxa.jpg 1
https://pbs.twimg.com/media/CtVAvX-WIAAcGTf.jpg 1
https://pbs.twimg.com/media/C4KHj-nWQAA3poV.jpg 1
https://pbs.twimg.com/media/CrXhIqBW8AA6Bse.jpg 1
https://pbs.twimg.com/media/CVMOlMiWwAA4Yxl.jpg 1
https://pbs.twimg.com/media/C12x-JTVIAAzdfl.jpg 1
https://pbs.twimg.com/media/C2oRbOuWEAAbVSl.jpg 1
https://pbs.twimg.com/media/CWza7kpWcAAdYLc.jpg 1
https://pbs.twimg.com/media/CsGnz64WYAEIDHJ.jpg 1
https://pbs.twimg.com/media/C2kzTGxWEAEOpPL.jpg 1
https://pbs.twimg.com/media/CiibOMzUYAA9Mxz.jpg 1
https://pbs.twimg.com/media/CwS4aqZXUAAe3IO.jpg 1
Name: jpg_url, Length: 66, dtype: int64
print(image_predictions[image_predictions['jpg_url'].isnull()]['jpg_url'].value_counts()) #图片链接验证没有为空的。不用处理
Series([], Name: jpg_url, dtype: int64)
image_predictions[(image_predictions['p1_dog']==False)&(image_predictions['p2_dog']==False)&(image_predictions['p3_dog']==False) ]
# 11) 质量问题: image_predictions 中预测完全不属于狗的 数据有324条,需要处理删除处理
tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
6 | 666051853826850816 | https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg | 1 | box_turtle | 0.933012 | False | mud_turtle | 4.588540e-02 | False | terrapin | 1.788530e-02 | False |
17 | 666104133288665088 | https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg | 1 | hen | 0.965932 | False | cock | 3.391940e-02 | False | partridge | 5.206580e-05 | False |
18 | 666268910803644416 | https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg | 1 | desktop_computer | 0.086502 | False | desk | 8.554740e-02 | False | bookcase | 7.947970e-02 | False |
21 | 666293911632134144 | https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg | 1 | three-toed_sloth | 0.914671 | False | otter | 1.525000e-02 | False | great_grey_owl | 1.320720e-02 | False |
25 | 666362758909284353 | https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg | 1 | guinea_pig | 0.996496 | False | skunk | 2.402450e-03 | False | hamster | 4.608630e-04 | False |
29 | 666411507551481857 | https://pbs.twimg.com/media/CT-RugiWIAELEaq.jpg | 1 | coho | 0.404640 | False | barracouta | 2.714850e-01 | False | gar | 1.899450e-01 | False |
45 | 666786068205871104 | https://pbs.twimg.com/media/CUDmZIkWcAAIPPe.jpg | 1 | snail | 0.999888 | False | slug | 5.514170e-05 | False | acorn | 2.625800e-05 | False |
50 | 666837028449972224 | https://pbs.twimg.com/media/CUEUva1WsAA2jPb.jpg | 1 | triceratops | 0.442113 | False | armadillo | 1.140710e-01 | False | common_iguana | 4.325530e-02 | False |
51 | 666983947667116034 | https://pbs.twimg.com/media/CUGaXDhW4AY9JUH.jpg | 1 | swab | 0.589446 | False | chain_saw | 1.901420e-01 | False | wig | 3.450970e-02 | False |
53 | 667012601033924608 | https://pbs.twimg.com/media/CUG0bC0U8AAw2su.jpg | 1 | hyena | 0.987230 | False | African_hunting_dog | 1.260080e-02 | False | coyote | 5.735010e-05 | False |
56 | 667065535570550784 | https://pbs.twimg.com/media/CUHkkJpXIAA2w3n.jpg | 1 | jigsaw_puzzle | 0.560001 | False | doormat | 1.032590e-01 | False | space_heater | 4.256800e-02 | False |
69 | 667188689915760640 | https://pbs.twimg.com/media/CUJUk2iWUAAVtOv.jpg | 1 | vacuum | 0.335830 | False | swab | 2.652780e-01 | False | toilet_tissue | 1.407030e-01 | False |
73 | 667369227918143488 | https://pbs.twimg.com/media/CUL4xR9UkAEdlJ6.jpg | 1 | teddy | 0.709545 | False | bath_towel | 1.272850e-01 | False | Christmas_stocking | 2.856750e-02 | False |
77 | 667437278097252352 | https://pbs.twimg.com/media/CUM2qWaWoAUZ06L.jpg | 1 | porcupine | 0.989154 | False | bath_towel | 6.300490e-03 | False | badger | 9.663400e-04 | False |
78 | 667443425659232256 | https://pbs.twimg.com/media/CUM8QZwW4AAVsBl.jpg | 1 | goose | 0.980815 | False | drake | 6.917770e-03 | False | hen | 5.255170e-03 | False |
93 | 667549055577362432 | https://pbs.twimg.com/media/CUOcVCwWsAERUKY.jpg | 1 | electric_fan | 0.984377 | False | spotlight | 7.736710e-03 | False | lampshade | 1.901230e-03 | False |
94 | 667550882905632768 | https://pbs.twimg.com/media/CUObvUJVEAAnYPF.jpg | 1 | web_site | 0.998258 | False | dishwasher | 2.010840e-04 | False | oscilloscope | 1.417360e-04 | False |
96 | 667724302356258817 | https://pbs.twimg.com/media/CUQ7tv3W4AA3KlI.jpg | 1 | ibex | 0.619098 | False | bighorn | 1.251190e-01 | False | ram | 7.467320e-02 | False |
98 | 667766675769573376 | https://pbs.twimg.com/media/CURiQMnUAAAPT2M.jpg | 1 | fire_engine | 0.883493 | False | tow_truck | 7.473390e-02 | False | jeep | 1.277260e-02 | False |
100 | 667782464991965184 | https://pbs.twimg.com/media/CURwm3cUkAARcO6.jpg | 1 | lorikeet | 0.466149 | False | hummingbird | 8.301100e-02 | False | African_grey | 5.424740e-02 | False |
106 | 667866724293877760 | https://pbs.twimg.com/media/CUS9PlUWwAANeAD.jpg | 1 | jigsaw_puzzle | 1.000000 | False | prayer_rug | 1.011300e-08 | False | doormat | 1.740170e-10 | False |
107 | 667873844930215936 | https://pbs.twimg.com/media/CUTDtyGXIAARxus.jpg | 1 | common_iguana | 0.999647 | False | frilled_lizard | 1.811500e-04 | False | African_chameleon | 1.283570e-04 | False |
112 | 667911425562669056 | https://pbs.twimg.com/media/CUTl5m1WUAAabZG.jpg | 1 | frilled_lizard | 0.257695 | False | ox | 2.351600e-01 | False | triceratops | 8.531690e-02 | False |
115 | 667937095915278337 | https://pbs.twimg.com/media/CUT9PuQWwAABQv7.jpg | 1 | hamster | 0.172078 | False | guinea_pig | 9.492420e-02 | False | Band_Aid | 5.999520e-02 | False |
117 | 668142349051129856 | https://pbs.twimg.com/media/CUW37BzWsAAlJlN.jpg | 1 | Angora | 0.918834 | False | hen | 3.779340e-02 | False | wood_rabbit | 1.101490e-02 | False |
118 | 668154635664932864 | https://pbs.twimg.com/media/CUXDGR2WcAAUQKz.jpg | 1 | Arctic_fox | 0.473584 | False | wallaby | 2.614110e-01 | False | white_wolf | 8.094780e-02 | False |
123 | 668226093875376128 | https://pbs.twimg.com/media/CUYEFlQXAAUkPGm.jpg | 1 | trombone | 0.390339 | False | cornet | 3.141490e-01 | False | French_horn | 2.551820e-01 | False |
130 | 668291999406125056 | https://pbs.twimg.com/media/CUZABzGW4AE5F0k.jpg | 1 | web_site | 0.995535 | False | skunk | 1.363490e-03 | False | badger | 6.856500e-04 | False |
132 | 668466899341221888 | https://pbs.twimg.com/media/CUbfGbbWoAApZth.jpg | 1 | shopping_basket | 0.398361 | False | hamper | 3.632220e-01 | False | bassinet | 8.417350e-02 | False |
140 | 668544745690562560 | https://pbs.twimg.com/media/CUcl5jeWsAA6ufS.jpg | 1 | bearskin | 0.427870 | False | bow | 2.588580e-01 | False | panpipe | 2.156260e-02 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1839 | 837482249356513284 | https://pbs.twimg.com/media/C59VqMUXEAAzldG.jpg | 2 | birdhouse | 0.541196 | False | can_opener | 1.210940e-01 | False | carton | 5.613670e-02 | False |
1844 | 838916489579200512 | https://pbs.twimg.com/media/C6RkiQZUsAAM4R4.jpg | 2 | web_site | 0.993651 | False | monitor | 1.405900e-03 | False | envelope | 1.093090e-03 | False |
1847 | 839290600511926273 | https://pbs.twimg.com/media/C6XBt9XXEAEEW9U.jpg | 1 | web_site | 0.670892 | False | monitor | 1.015650e-01 | False | screen | 7.530610e-02 | False |
1851 | 840370681858686976 | https://pbs.twimg.com/media/C6mYrK0UwAANhep.jpg | 1 | teapot | 0.981819 | False | cup | 1.402580e-02 | False | coffeepot | 2.420540e-03 | False |
1853 | 840696689258311684 | https://pbs.twimg.com/media/C6rBLenU0AAr8MN.jpg | 1 | web_site | 0.841768 | False | rule | 7.087310e-03 | False | envelope | 6.820300e-03 | False |
1869 | 844580511645339650 | https://pbs.twimg.com/media/C7iNfq1W0AAcbsR.jpg | 1 | washer | 0.903064 | False | dishwasher | 3.248900e-02 | False | printer | 1.645620e-02 | False |
1886 | 847962785489326080 | https://pbs.twimg.com/media/C8SRpHNUIAARB3j.jpg | 1 | sea_lion | 0.882654 | False | mink | 6.688020e-02 | False | otter | 2.567870e-02 | False |
1887 | 847971574464610304 | https://pbs.twimg.com/media/C8SZH1EWAAAIRRF.jpg | 1 | coffee_mug | 0.633652 | False | cup | 2.733920e-01 | False | toilet_tissue | 6.665580e-02 | False |
1891 | 849051919805034497 | https://pbs.twimg.com/media/C8hwNxbXYAAwyVG.jpg | 1 | fountain | 0.997509 | False | American_black_bear | 1.413120e-03 | False | sundial | 6.811150e-04 | False |
1892 | 849336543269576704 | https://pbs.twimg.com/media/C8lzFC4XcAAQxB4.jpg | 1 | patio | 0.521788 | False | prison | 1.495440e-01 | False | restaurant | 2.715260e-02 | False |
1900 | 851464819735769094 | https://pbs.twimg.com/media/C9ECujZXsAAPCSM.jpg | 2 | web_site | 0.919649 | False | menu | 2.630610e-02 | False | crossword_puzzle | 3.481510e-03 | False |
1902 | 851861385021730816 | https://pbs.twimg.com/media/C8W6sY_W0AEmttW.jpg | 1 | pencil_box | 0.662183 | False | purse | 6.650550e-02 | False | pillow | 4.472530e-02 | False |
1905 | 852226086759018497 | https://pbs.twimg.com/ext_tw_video_thumb/85222... | 1 | prison | 0.352793 | False | dishwasher | 1.107230e-01 | False | file | 9.411200e-02 | False |
1906 | 852311364735569921 | https://pbs.twimg.com/media/C9QEqZ7XYAIR7fS.jpg | 1 | barbell | 0.971581 | False | dumbbell | 2.841790e-02 | False | go-kart | 5.595040e-07 | False |
1910 | 853299958564483072 | https://pbs.twimg.com/media/C9eHyF7XgAAOxPM.jpg | 1 | grille | 0.652280 | False | beach_wagon | 1.128460e-01 | False | convertible | 8.625230e-02 | False |
1931 | 859074603037188101 | https://pbs.twimg.com/media/C-wLyufW0AA546I.jpg | 1 | revolver | 0.190292 | False | projectile | 1.490640e-01 | False | fountain | 6.604660e-02 | False |
1936 | 860184849394610176 | https://pbs.twimg.com/media/C-_9jWWUwAAnwkd.jpg | 1 | chimpanzee | 0.267612 | False | gorilla | 1.042930e-01 | False | orangutan | 5.990750e-02 | False |
1937 | 860276583193509888 | https://pbs.twimg.com/media/C_BQ_NlVwAAgYGD.jpg | 1 | lakeside | 0.312299 | False | dock | 1.598420e-01 | False | canoe | 7.079450e-02 | False |
1940 | 860924035999428608 | https://pbs.twimg.com/media/C_KVJjDXsAEUCWn.jpg | 2 | envelope | 0.933016 | False | oscilloscope | 1.259140e-02 | False | paper_towel | 1.117850e-02 | False |
1946 | 862457590147678208 | https://pbs.twimg.com/media/C_gQmaTUMAAPYSS.jpg | 1 | home_theater | 0.496348 | False | studio_couch | 1.672560e-01 | False | barber_chair | 5.262500e-02 | False |
1953 | 863907417377173506 | https://pbs.twimg.com/media/C_03NPeUQAAgrMl.jpg | 1 | marmot | 0.358828 | False | meerkat | 1.747030e-01 | False | weasel | 1.234850e-01 | False |
1956 | 864873206498414592 | https://pbs.twimg.com/media/DAClmHkXcAA1kSv.jpg | 2 | pole | 0.478616 | False | lakeside | 1.141820e-01 | False | wreck | 5.592650e-02 | False |
1975 | 870063196459192321 | https://pbs.twimg.com/media/DBMV3NnXUAAm0Pp.jpg | 1 | comic_book | 0.534409 | False | envelope | 2.807220e-01 | False | book_jacket | 4.378550e-02 | False |
1979 | 870804317367881728 | https://pbs.twimg.com/media/DBW35ZsVoAEWZUU.jpg | 1 | home_theater | 0.168290 | False | sandbar | 9.804040e-02 | False | television | 7.972940e-02 | False |
2012 | 879050749262655488 | https://pbs.twimg.com/media/DDMD_phXoAQ1qf0.jpg | 1 | tabby | 0.311861 | False | window_screen | 1.691230e-01 | False | Egyptian_cat | 1.329320e-01 | False |
2021 | 880935762899988482 | https://pbs.twimg.com/media/DDm2Z5aXUAEDS2u.jpg | 1 | street_sign | 0.251801 | False | umbrella | 1.151230e-01 | False | traffic_light | 6.953380e-02 | False |
2022 | 881268444196462592 | https://pbs.twimg.com/media/DDrk-f9WAAI-WQv.jpg | 1 | tusker | 0.473303 | False | Indian_elephant | 2.456460e-01 | False | ibex | 5.566070e-02 | False |
2046 | 886680336477933568 | https://pbs.twimg.com/media/DE4fEDzWAAAyHMM.jpg | 1 | convertible | 0.738995 | False | sports_car | 1.399520e-01 | False | car_wheel | 4.417270e-02 | False |
2052 | 887517139158093824 | https://pbs.twimg.com/ext_tw_video_thumb/88751... | 1 | limousine | 0.130432 | False | tow_truck | 2.917540e-02 | False | shopping_cart | 2.632080e-02 | False |
2074 | 892420643555336193 | https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg | 1 | orange | 0.097049 | False | bagel | 8.585110e-02 | False | banana | 7.611000e-02 | False |
324 rows × 12 columns
# 读取出各个文件。
tweetIdList = [] #这个是装提取出来的id的
tweetReTList = [] #这个是转推次数的提取
tweetFavList = [] # 喜欢数的提取。
tweetTextList =[]
with open('tweet_json.txt') as json_file: #直接打开文件的方式打开失败,于是改用按行读取的方式
for oneLine in json_file.readlines():
tempDic= json.loads(oneLine) #每部分都是字典的东西放在这儿。
tempID = tempDic['id_str'] #根据需求分别提取推文用户id,喜欢数,转发数
tempRe = tempDic['retweet_count']
tempFa = tempDic['favorite_count']
tweetIdList.append(tempID)
tweetReTList.append(tempRe)
tweetFavList.append(tempFa)
tweet_json = pd.DataFrame({'tweet_id':tweetIdList,'retweet_count':tweetReTList,'favorite_count':tweetFavList})
tweet_json #从tweet_json.txt 中提取出 用户id,喜欢数,转发数
# 12)发现问题 ,整洁度问题: 三个表格有相同tweet_id的字段,可以合并成一个进行操作。
tweet_id | retweet_count | favorite_count | |
---|---|---|---|
0 | 892420643555336193 | 8842 | 39492 |
1 | 892177421306343426 | 6480 | 33786 |
2 | 891815181378084864 | 4301 | 25445 |
3 | 891689557279858688 | 8925 | 42863 |
4 | 891327558926688256 | 9721 | 41016 |
5 | 891087950875897856 | 3240 | 20548 |
6 | 890971913173991426 | 2142 | 12053 |
7 | 890729181411237888 | 19548 | 66596 |
8 | 890609185150312448 | 4403 | 28187 |
9 | 890240255349198849 | 7684 | 32467 |
10 | 890006608113172480 | 7584 | 31127 |
11 | 889880896479866881 | 5116 | 28208 |
12 | 889665388333682689 | 8502 | 38745 |
13 | 889638837579907072 | 4705 | 27633 |
14 | 889531135344209921 | 2309 | 15329 |
15 | 889278841981685760 | 5635 | 25712 |
16 | 888917238123831296 | 4681 | 29555 |
17 | 888804989199671297 | 4535 | 26021 |
18 | 888554962724278272 | 3722 | 20267 |
19 | 888078434458587136 | 3637 | 22144 |
20 | 887705289381826560 | 5584 | 30690 |
21 | 887517139158093824 | 12053 | 46940 |
22 | 887473957103951883 | 18813 | 70007 |
23 | 887343217045368832 | 10713 | 34223 |
24 | 887101392804085760 | 6147 | 31045 |
25 | 886983233522544640 | 8045 | 35786 |
26 | 886736880519319552 | 3420 | 12286 |
27 | 886680336477933568 | 4597 | 22802 |
28 | 886366144734445568 | 3297 | 21488 |
29 | 886267009285017600 | 4 | 117 |
... | ... | ... | ... |
2322 | 666411507551481857 | 337 | 457 |
2323 | 666407126856765440 | 43 | 113 |
2324 | 666396247373291520 | 91 | 171 |
2325 | 666373753744588802 | 99 | 194 |
2326 | 666362758909284353 | 590 | 801 |
2327 | 666353288456101888 | 76 | 228 |
2328 | 666345417576210432 | 146 | 308 |
2329 | 666337882303524864 | 96 | 203 |
2330 | 666293911632134144 | 365 | 519 |
2331 | 666287406224695296 | 71 | 152 |
2332 | 666273097616637952 | 81 | 183 |
2333 | 666268910803644416 | 37 | 108 |
2334 | 666104133288665088 | 6835 | 14703 |
2335 | 666102155909144576 | 15 | 81 |
2336 | 666099513787052032 | 73 | 160 |
2337 | 666094000022159362 | 78 | 168 |
2338 | 666082916733198337 | 47 | 121 |
2339 | 666073100786774016 | 173 | 334 |
2340 | 666071193221509120 | 67 | 154 |
2341 | 666063827256086533 | 230 | 494 |
2342 | 666058600524156928 | 61 | 117 |
2343 | 666057090499244032 | 146 | 304 |
2344 | 666055525042405380 | 261 | 449 |
2345 | 666051853826850816 | 877 | 1250 |
2346 | 666050758794694657 | 60 | 136 |
2347 | 666049248165822465 | 41 | 111 |
2348 | 666044226329800704 | 147 | 309 |
2349 | 666033412701032449 | 47 | 128 |
2350 | 666029285002620928 | 48 | 132 |
2351 | 666020888022790149 | 530 | 2528 |
2352 rows × 3 columns
tweet_json.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2352 entries, 0 to 2351
Data columns (total 3 columns):
tweet_id 2352 non-null object
retweet_count 2352 non-null int64
favorite_count 2352 non-null int64
dtypes: int64(2), object(1)
memory usage: 55.2+ KB
3.清洗
将上面评估时发现的数据问题汇总如下
问题定义
质量问题:
-
1)质量问题: tweet_id 用户id应该为字符串类型,而不是int64 数值类型,所以需要修改。
-
2)质量问题: expanded_urls是推文的链接地址,存在缺失值,缺失的行可能已经失效,需要处理
-
3)质量问题: source 里面包含html的标签,可以进一步提取出去html标签的文本内容,表示来源。
-
4)质量问题: jpg_url是发现有重复值,需要清理。
-
5)retweeted_status_user_id 不为空的是 转发的推文 ,需要清理掉,只留下非转发的推文
-
6)质量问题: 质量问题:狗的名字可以相同,但是 有异常的 a,None the an
-
7)质量问题: 发现in_reply_to_status_id in_reply_to_user_id 都只有78条,基本都是空值,需要清理
-
8)质量问题: image_predictions 三次预测中完全不属于狗的 数据有324条,需要处理删除处理
-
9)质量问题: 评分的分母除了10和10的倍数的,还有少量不为10,如11,2,7需要重新检查或者重新提取
整洁性问题:
- 10)三个表格有相同tweet_id的字段,可以合并成一个进行操作。
-
- expanded_urls 中有些列里面有多个相同的expanded_urls 观察到。
-
- 整洁性问题:狗的地位stage(体型而定的) 应该为分类数据,应该放在同一列中
下面总体按照顺序进行处理,但是因为有些处理完了才能继续处理后面的,所以些许顺序对不上,如下tweet_id因为合并需要格式相同,故提前处理,不过总共是可以确认处理完了8个质量问题,和2个整洁性问题的。
# 展开所有的column
pd.options.display.max_columns=500
pd.set_option('max_colwidth',200)
#定义: 1)质量问题: tweet_id 用户id应该为字符串类型,而不是int64 数值类型,所以需要修改。
#定义: 9)整洁性问题: 合并三个表
# 编码 解决tweet_id 数据类型不对和 合并三个数据集
tempTable = pd.merge(twitter_achieve,image_predictions,on="tweet_id") #类型需要转换成相同的才可以。
tempTable_clean = tempTable.copy() #复制一份
tempTable_clean['tweet_id'] = tempTable_clean['tweet_id'].apply(str) #把前两列合并成的tweet_id 格式转成字符串类型。
tempTable_all = pd.merge(tempTable_clean,tweet_json,on="tweet_id")
final_Data_clean = tempTable_all.copy() #复制一份,不改变原来数据集
final_Data_clean.info()
#测试 检查数据集格式tweet_id 是否为字符串类型,已确认修改成功;合并三个表,已确认,成功。
final_Data_clean.head(4)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2073 entries, 0 to 2072
Data columns (total 30 columns):
tweet_id 2073 non-null object
in_reply_to_status_id 23 non-null float64
in_reply_to_user_id 23 non-null float64
timestamp 2073 non-null object
source 2073 non-null object
text 2073 non-null object
retweeted_status_id 79 non-null float64
retweeted_status_user_id 79 non-null float64
retweeted_status_timestamp 79 non-null object
expanded_urls 2073 non-null object
rating_numerator 2073 non-null int64
rating_denominator 2073 non-null int64
name 2073 non-null object
doggo 2073 non-null object
floofer 2073 non-null object
pupper 2073 non-null object
puppo 2073 non-null object
jpg_url 2073 non-null object
img_num 2073 non-null int64
p1 2073 non-null object
p1_conf 2073 non-null float64
p1_dog 2073 non-null bool
p2 2073 non-null object
p2_conf 2073 non-null float64
p2_dog 2073 non-null bool
p3 2073 non-null object
p3_conf 2073 non-null float64
p3_dog 2073 non-null bool
retweet_count 2073 non-null int64
favorite_count 2073 non-null int64
dtypes: bool(3), float64(7), int64(5), object(15)
memory usage: 459.5+ KB
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | retweet_count | favorite_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643555336193/photo/1 | 13 | 10 | Phineas | None | None | None | None | https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg | 1 | orange | 0.097049 | False | bagel | 0.085851 | False | banana | 0.076110 | False | 8842 | 39492 |
1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421306343426/photo/1 | 13 | 10 | Tilly | None | None | None | None | https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg | 1 | Chihuahua | 0.323581 | True | Pekinese | 0.090647 | True | papillon | 0.068957 | True | 6480 | 33786 |
2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181378084864/photo/1 | 12 | 10 | Archie | None | None | None | None | https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg | 1 | Chihuahua | 0.716012 | True | malamute | 0.078253 | True | kelpie | 0.031379 | True | 4301 | 25445 |
3 | 891689557279858688 | NaN | NaN | 2017-07-30 15:58:51 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891689557279858688/photo/1 | 13 | 10 | Darla | None | None | None | None | https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg | 1 | paper_towel | 0.170278 | False | Labrador_retriever | 0.168086 | True | spatula | 0.040836 | False | 8925 | 42863 |
# 定义: 2)质量问题: expanded_urls是推文的链接地址,存在缺失值,缺失的行可能已经失效,需要处理
#编码
final_Data_clean = final_Data_clean[final_Data_clean['expanded_urls'].notnull()] #把缺失值的 expanded_urls 去掉。留下没有缺失的。
#测试
final_Data_clean.expanded_urls.isnull().value_counts() #验证完毕,不再出现含有缺失值的 expanded_urls了。
False 2073
Name: expanded_urls, dtype: int64
# 定义:3)质量问题:source 里面包含html的标签,可以进一步提取
#编码
urlString = '<a href="https://www.baidu.com/link?url=A3b9CWrhoCv4Oxw6z40oAU2_qNwN9756AJwaCLaPmBpK0bFjU8Rjv2LwWLL7fvHwgyq4cwaMfgO6_as6CpzUg_&wd=&eqid=be1ccee200026e65000000065c81f06b" target="_blank"><em>Beautiful Soup</em> Documentation — <em>Beautiful Soup</em> 4.4.0 ...</a>'
from bs4 import BeautifulSoup
def phraseHtml(string): #编写函数用来提取source中a标签中的链接内容
soup = BeautifulSoup(string,'lxml') #使用BeautifulSoup来解析html
url = soup.find("a").string
return url
final_Data_clean['source'] = final_Data_clean['source'].apply(phraseHtml) # 应用函数提取出a标签内url 。
#测试: 检查source是否提取成功url,已验证,提取成功。
print(final_Data_clean['source'].value_counts())
final_Data_clean.head(4)
Twitter for iPhone 2032
Twitter Web Client 30
TweetDeck 11
Name: source, dtype: int64
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | retweet_count | favorite_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | Twitter for iPhone | This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643555336193/photo/1 | 13 | 10 | Phineas | None | None | None | None | https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg | 1 | orange | 0.097049 | False | bagel | 0.085851 | False | banana | 0.076110 | False | 8842 | 39492 |
1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | Twitter for iPhone | This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421306343426/photo/1 | 13 | 10 | Tilly | None | None | None | None | https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg | 1 | Chihuahua | 0.323581 | True | Pekinese | 0.090647 | True | papillon | 0.068957 | True | 6480 | 33786 |
2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | Twitter for iPhone | This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181378084864/photo/1 | 12 | 10 | Archie | None | None | None | None | https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg | 1 | Chihuahua | 0.716012 | True | malamute | 0.078253 | True | kelpie | 0.031379 | True | 4301 | 25445 |
3 | 891689557279858688 | NaN | NaN | 2017-07-30 15:58:51 +0000 | Twitter for iPhone | This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891689557279858688/photo/1 | 13 | 10 | Darla | None | None | None | None | https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg | 1 | paper_towel | 0.170278 | False | Labrador_retriever | 0.168086 | True | spatula | 0.040836 | False | 8925 | 42863 |
# 定义:4)质量问题: jpg_url是发现有重复值,需要清理。
#编码
print(final_Data_clean[final_Data_clean['jpg_url'].duplicated() ==True].iloc[:,0].size)# #检查到有79条之行的jpg_url是重复的。
final_Data_clean = final_Data_clean[final_Data_clean['jpg_url'].duplicated() ==False] #直接保留没重复的jpg_url的数据
#测试
final_Data_clean[final_Data_clean['jpg_url'].duplicated() ==True] #正确除去了包含jpg_url重复的内容
64
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | retweet_count | favorite_count |
---|
# 定义: 5) retweeted_status_user_id 不为空的是 转发的推文 ,需要清理掉,只留下非转发的推文
#编码
final_Data_clean = final_Data_clean[final_Data_clean['retweeted_status_user_id'].isnull()] # 把去掉的改过去,
#测试,验证成功,已经删除了转发的推特文。
final_Data_clean[final_Data_clean['retweeted_status_user_id'].notnull()] #
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | retweet_count | favorite_count |
---|
# 定义:6)质量问题:狗的名字可以相同,但是 有异常的 a,None the an
#编码
final_Data_clean['name'] = final_Data_clean['name'].replace(['a','None','the','an'],np.nan) # 把name中的 a None the 用NaN进行填充
#测试,已除去 a None the
final_Data_clean['name'].value_counts() #这样就看不到空值了咯。
Charlie 11
Cooper 10
Lucy 10
Oliver 10
Winston 8
Sadie 8
Penny 8
Tucker 8
Toby 7
Daisy 7
Stanley 6
Bella 6
Koda 6
Jax 6
Lola 6
Oscar 5
Leo 5
Chester 5
Louis 5
Buddy 5
Phil 4
Maggie 4
Duke 4
Gus 4
Rusty 4
Brody 4
Scout 4
Milo 4
Archie 4
Dexter 4
..
Blipson 1
Jangle 1
Taco 1
Willy 1
Pepper 1
Pipsy 1
Aja 1
Noah 1
Pip 1
Sailer 1
Clifford 1
Bertson 1
Thor 1
Julius 1
Flash 1
Binky 1
Ralphus 1
Rover 1
Shiloh 1
Margo 1
Tito 1
Brownie 1
my 1
Colin 1
Buckley 1
Alexander 1
Kulet 1
Keurig 1
Trigger 1
Jarod 1
Name: name, Length: 909, dtype: int64
# 定义 :7)质量问题: 发现in_reply_to_status_id in_reply_to_user_id 都只有78条,基本都是空值,需要清理
#编码
final_Data_clean= final_Data_clean.drop(['in_reply_to_status_id','in_reply_to_user_id'],axis='columns')
#测试,以通过,去掉了不需要的这两列
final_Data_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1930 entries, 0 to 2072
Data columns (total 28 columns):
tweet_id 1930 non-null object
timestamp 1930 non-null object
source 1930 non-null object
text 1930 non-null object
retweeted_status_id 0 non-null float64
retweeted_status_user_id 0 non-null float64
retweeted_status_timestamp 0 non-null object
expanded_urls 1930 non-null object
rating_numerator 1930 non-null int64
rating_denominator 1930 non-null int64
name 1333 non-null object
doggo 1930 non-null object
floofer 1930 non-null object
pupper 1930 non-null object
puppo 1930 non-null object
jpg_url 1930 non-null object
img_num 1930 non-null int64
p1 1930 non-null object
p1_conf 1930 non-null float64
p1_dog 1930 non-null bool
p2 1930 non-null object
p2_conf 1930 non-null float64
p2_dog 1930 non-null bool
p3 1930 non-null object
p3_conf 1930 non-null float64
p3_dog 1930 non-null bool
retweet_count 1930 non-null int64
favorite_count 1930 non-null int64
dtypes: bool(3), float64(5), int64(5), object(15)
memory usage: 397.7+ KB
#定义 8)质量问题: image_predictions 中预测完全不属于狗的 数据有324条,需要处理删除处理
#编码
final_Data_clean[(final_Data_clean['p1_dog']==False)&(final_Data_clean['p2_dog']==False)&(final_Data_clean['p3_dog']==False)] #图片识别为不是狗的,需要排除掉
final_Data_clean = final_Data_clean[(final_Data_clean['p1_dog']==True)|(final_Data_clean['p2_dog']==True)|(final_Data_clean['p3_dog']==True)] #这里这儿只保留识别成是狗的种类的
#测试,验证成功,已经除去了三次识别都不是狗的数据。
final_Data_clean[(final_Data_clean['p1_dog']==False)&(final_Data_clean['p2_dog']==False)&(final_Data_clean['p3_dog']==False)]
tweet_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | retweet_count | favorite_count |
---|
# 定义:10 )评分的分母除了10和10的倍数的,还有少量不为10,如11,2,7需要重新检查或者重新提取 <第9个合并三个表在1)那儿一起解决了>
#编码
print(final_Data_clean[final_Data_clean['rating_denominator']!=10].iloc[:,0].size) #非0 数量不算多,所以直接目测手动处理
final_Data_clean[final_Data_clean['rating_denominator']!=10][['text','rating_numerator','rating_denominator']]
17
text | rating_numerator | rating_denominator | |
---|---|---|---|
344 | The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd | 84 | 70 |
414 | Meet Sam. She smiles 24/7 & secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx | 24 | 7 |
734 | Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE | 165 | 150 |
876 | After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ | 9 | 11 |
967 | Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a | 4 | 20 |
1001 | This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq | 50 | 50 |
1022 | Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1 | 99 | 90 |
1047 | Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12 | 80 | 80 |
1065 | From left to right:\nCletus, Jerome, Alejandro, Burp, & Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK | 45 | 50 |
1131 | Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa | 60 | 50 |
1207 | Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ | 44 | 40 |
1379 | Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3 | 143 | 130 |
1380 | Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55 | 121 | 110 |
1405 | This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5 | 7 | 11 |
1512 | IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq | 144 | 120 |
1571 | Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw | 88 | 80 |
2052 | This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv | 1 | 2 |
# 数量不多,故可以直接目测,目测结果,基本上没什么错误
# 出现错误的主要是 id=[876,967,1405,2052] 他们对应的正确的值应该是 values=["14/10",'13/10','10/10 ',' 9/10 '] 所以单独设置就可以
#修改值
final_Data_clean.loc[876,"rating_numerator" ] = 14 #修改分子和分母
final_Data_clean.loc[876,"rating_denominator" ]= 10
final_Data_clean.loc[967,"rating_numerator" ] = 13 #修改分子和分母
final_Data_clean.loc[967,"rating_denominator" ]= 10
final_Data_clean.loc[1405,"rating_numerator" ] =10 #修改分子和分母
final_Data_clean.loc[1405,"rating_denominator" ]= 10
final_Data_clean.loc[2052,"rating_numerator" ]=9 #修改分子和分母 ?因为不是顺序的,所以iloc会对应不上吗,loc,默认的是index
final_Data_clean.loc[2052,"rating_denominator" ]=10
#测试1
final_Data_clean[final_Data_clean['rating_denominator']!=10][['text','rating_numerator','rating_denominator']] #验证修改成功。
text | rating_numerator | rating_denominator | |
---|---|---|---|
344 | The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd | 84 | 70 |
414 | Meet Sam. She smiles 24/7 & secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx | 24 | 7 |
734 | Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE | 165 | 150 |
1001 | This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq | 50 | 50 |
1022 | Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1 | 99 | 90 |
1047 | Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12 | 80 | 80 |
1065 | From left to right:\nCletus, Jerome, Alejandro, Burp, & Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK | 45 | 50 |
1131 | Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa | 60 | 50 |
1207 | Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ | 44 | 40 |
1379 | Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3 | 143 | 130 |
1380 | Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55 | 121 | 110 |
1512 | IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq | 144 | 120 |
1571 | Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw | 88 | 80 |
# 用分子分母相除,结果存到新列中。
#测试2
final_Data_clean['Final_Grade']=final_Data_clean['rating_numerator']/final_Data_clean['rating_denominator']
final_Data_clean['Final_Grade']=final_Data_clean['Final_Grade'].apply(lambda x: '%.2f'%x ) #结果保留两位小数
final_Data_clean['Final_Grade'].value_counts() #评分结果。
1.20 407
1.00 356
1.10 348
1.30 214
0.90 134
0.80 68
0.70 31
1.40 22
0.60 16
0.50 14
0.40 6
0.30 5
0.20 2
7.50 1
3.43 1
2.70 1
0.00 1
2.60 1
Name: Final_Grade, dtype: int64
# 定义;11)整洁性问题: expanded_urls 中有些列里面有多个相同的expanded_url 观察到。(第9个整洁性问题在一开始的时候就合并解决)
final_Data_clean['expanded_urls'] # 可以看到内部有些,有很多个重复的相同的链接地址,并且之间是用逗号链接的。
#编码
def cleanExpanded_urls(string):
if(string.find(",")!=-1):
return string.split(",")[-1]
else:
return string
final_Data_clean['expanded_urls'] = final_Data_clean['expanded_urls'] .apply(cleanExpanded_urls)
#测试,已经清理完毕
final_Data_clean['expanded_urls']
1 https://twitter.com/dog_rates/status/892177421306343426/photo/1
2 https://twitter.com/dog_rates/status/891815181378084864/photo/1
3 https://twitter.com/dog_rates/status/891689557279858688/photo/1
4 https://twitter.com/dog_rates/status/891327558926688256/photo/1
5 https://twitter.com/dog_rates/status/891087950875897856/photo/1
6 https://twitter.com/dog_rates/status/890971913173991426/photo/1
7 https://twitter.com/dog_rates/status/890729181411237888/photo/1
8 https://twitter.com/dog_rates/status/890609185150312448/photo/1
9 https://twitter.com/dog_rates/status/890240255349198849/photo/1
10 https://twitter.com/dog_rates/status/890006608113172480/photo/1
11 https://twitter.com/dog_rates/status/889880896479866881/photo/1
12 https://twitter.com/dog_rates/status/889665388333682689/photo/1
13 https://twitter.com/dog_rates/status/889638837579907072/photo/1
14 https://twitter.com/dog_rates/status/889531135344209921/photo/1
15 https://twitter.com/dog_rates/status/889278841981685760/video/1
16 https://twitter.com/dog_rates/status/888917238123831296/photo/1
17 https://twitter.com/dog_rates/status/888804989199671297/photo/1
18 https://twitter.com/dog_rates/status/888554962724278272/photo/1
19 https://twitter.com/dog_rates/status/888078434458587136/photo/1
20 https://twitter.com/dog_rates/status/887705289381826560/photo/1
22 https://twitter.com/dog_rates/status/887473957103951883/photo/1
23 https://twitter.com/dog_rates/status/887343217045368832/video/1
24 https://twitter.com/dog_rates/status/887101392804085760/photo/1
25 https://twitter.com/dog_rates/status/886983233522544640/photo/1
26 https://twitter.com/dog_rates/status/886736880519319552/photo/1
28 https://twitter.com/dog_rates/status/886366144734445568/photo/1
29 https://twitter.com/dog_rates/status/886258384151887873/photo/1
30 https://twitter.com/dog_rates/status/885984800019947520/photo/1
31 https://twitter.com/dog_rates/status/885528943205470208/photo/1
33 https://twitter.com/dog_rates/status/885167619883638784/photo/1
...
2037 https://twitter.com/dog_rates/status/666437273139982337/photo/1
2038 https://twitter.com/dog_rates/status/666435652385423360/photo/1
2039 https://twitter.com/dog_rates/status/666430724426358785/photo/1
2040 https://twitter.com/dog_rates/status/666428276349472768/photo/1
2041 https://twitter.com/dog_rates/status/666421158376562688/photo/1
2042 https://twitter.com/dog_rates/status/666418789513326592/photo/1
2044 https://twitter.com/dog_rates/status/666407126856765440/photo/1
2045 https://twitter.com/dog_rates/status/666396247373291520/photo/1
2046 https://twitter.com/dog_rates/status/666373753744588802/photo/1
2048 https://twitter.com/dog_rates/status/666353288456101888/photo/1
2049 https://twitter.com/dog_rates/status/666345417576210432/photo/1
2050 https://twitter.com/dog_rates/status/666337882303524864/photo/1
2052 https://twitter.com/dog_rates/status/666287406224695296/photo/1
2053 https://twitter.com/dog_rates/status/666273097616637952/photo/1
2056 https://twitter.com/dog_rates/status/666102155909144576/photo/1
2057 https://twitter.com/dog_rates/status/666099513787052032/photo/1
2058 https://twitter.com/dog_rates/status/666094000022159362/photo/1
2059 https://twitter.com/dog_rates/status/666082916733198337/photo/1
2060 https://twitter.com/dog_rates/status/666073100786774016/photo/1
2061 https://twitter.com/dog_rates/status/666071193221509120/photo/1
2062 https://twitter.com/dog_rates/status/666063827256086533/photo/1
2063 https://twitter.com/dog_rates/status/666058600524156928/photo/1
2064 https://twitter.com/dog_rates/status/666057090499244032/photo/1
2065 https://twitter.com/dog_rates/status/666055525042405380/photo/1
2067 https://twitter.com/dog_rates/status/666050758794694657/photo/1
2068 https://twitter.com/dog_rates/status/666049248165822465/photo/1
2069 https://twitter.com/dog_rates/status/666044226329800704/photo/1
2070 https://twitter.com/dog_rates/status/666033412701032449/photo/1
2071 https://twitter.com/dog_rates/status/666029285002620928/photo/1
2072 https://twitter.com/dog_rates/status/666020888022790149/photo/1
Name: expanded_urls, Length: 1628, dtype: object
# 定义:12)整洁性问题:狗的地位stage(体型而定的) 应该为分类数据,应该放在同一列中
#编码
final_Data_clean['stage']=final_Data_clean['text'].str.findall("(puppo|doggo|pupper|floofer)").copy()
#因为狗的身份来自于text中文本,所以想到了从文本中重新提取,然后提取的时候发现,原来一个文本中可能同时出现多个相同的身份描述;
#如下面计数输出可见,正则表达式都会找完为止,因为findall 返回的是list类型,而如果这样直接把多个list内元素连接会出现重复,
#那么最好的方法就是去重,于是采用了set(listObject)的方法,先把重复的去掉,就每种身份词只留下一个了。
print( final_Data_clean['stage'].value_counts() ) #还有1416是确实没有发现文本中有种类分类的,故置空处理
final_Data_clean['stage']=final_Data_clean['stage'].apply(lambda x:"-".join(set(x))) #原本只想到合并使用join,确实set()方法是看了网上后借用的。
print("置空前")
print(final_Data_clean['stage'].value_counts() ) #还有1416是确实没有发现文本中有种类分类的,故置空处理
final_Data_clean['stage']=final_Data_clean['stage'].replace("",np.nan)
#测试,验证已完成。
print()
print("置空后")
print(final_Data_clean['stage'].value_counts()) #还有1416是确实没有发现文本中有种类分类的,故置空处理
final_Data_clean.drop(final_Data_clean[['doggo','puppo','pupper','floofer']],axis=1,inplace=True) #把整理
final_Data_clean.info()
[] 1362
[pupper] 171
[doggo] 54
[puppo] 24
[pupper, pupper] 6
[doggo, pupper] 4
[floofer] 3
[puppo, doggo] 2
[pupper, pupper, pupper] 1
[pupper, doggo, doggo] 1
Name: stage, dtype: int64
置空前
1362
pupper 178
doggo 54
puppo 24
pupper-doggo 5
floofer 3
puppo-doggo 2
Name: stage, dtype: int64
置空后
pupper 178
doggo 54
puppo 24
pupper-doggo 5
floofer 3
puppo-doggo 2
Name: stage, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1628 entries, 1 to 2072
Data columns (total 26 columns):
tweet_id 1628 non-null object
timestamp 1628 non-null object
source 1628 non-null object
text 1628 non-null object
retweeted_status_id 0 non-null float64
retweeted_status_user_id 0 non-null float64
retweeted_status_timestamp 0 non-null object
expanded_urls 1628 non-null object
rating_numerator 1628 non-null int64
rating_denominator 1628 non-null int64
name 1166 non-null object
jpg_url 1628 non-null object
img_num 1628 non-null int64
p1 1628 non-null object
p1_conf 1628 non-null float64
p1_dog 1628 non-null bool
p2 1628 non-null object
p2_conf 1628 non-null float64
p2_dog 1628 non-null bool
p3 1628 non-null object
p3_conf 1628 non-null float64
p3_dog 1628 non-null bool
retweet_count 1628 non-null int64
favorite_count 1628 non-null int64
Final_Grade 1628 non-null object
stage 266 non-null object
dtypes: bool(3), float64(5), int64(5), object(13)
memory usage: 390.0+ KB
保存清理和合并后的数据集
# 保存清洁的数据集到twitter_archive_master.csv:
final_Data_clean.to_csv("twitter_archive_master.csv", encoding='utf-8') #存入
print("保存完毕!twitter_archive_master.csv")
保存完毕!twitter_archive_master.csv
4.分析
题出问题
- 狗狗的stage中哪种身份最多?
- 数据集中,排名前10的最常见的10个名字是哪些?
- 数据集中,狗狗的评分大部分是多少?
twitter_archive_master= pd.read_csv("twitter_archive_master.csv") #读取整理好的数据集中的数据
twitter_archive_master.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1628 entries, 0 to 1627
Data columns (total 27 columns):
Unnamed: 0 1628 non-null int64
tweet_id 1628 non-null int64
timestamp 1628 non-null object
source 1628 non-null object
text 1628 non-null object
retweeted_status_id 0 non-null float64
retweeted_status_user_id 0 non-null float64
retweeted_status_timestamp 0 non-null float64
expanded_urls 1628 non-null object
rating_numerator 1628 non-null int64
rating_denominator 1628 non-null int64
name 1166 non-null object
jpg_url 1628 non-null object
img_num 1628 non-null int64
p1 1628 non-null object
p1_conf 1628 non-null float64
p1_dog 1628 non-null bool
p2 1628 non-null object
p2_conf 1628 non-null float64
p2_dog 1628 non-null bool
p3 1628 non-null object
p3_conf 1628 non-null float64
p3_dog 1628 non-null bool
retweet_count 1628 non-null int64
favorite_count 1628 non-null int64
Final_Grade 1628 non-null float64
stage 266 non-null object
dtypes: bool(3), float64(7), int64(7), object(10)
memory usage: 310.1+ KB
# 分析 狗狗的stage中哪种身份最多?
plt.title("Proportion of doys in each stage")
twitter_archive_master.stage.value_counts().plot(kind="pie",figsize=(8, 8),autopct='%.1f')
#结论: 可以看到下图,最多的是处于pupper的,有66.9的狗处于这个身份
<matplotlib.axes._subplots.AxesSubplot at 0x1c04945a080>
# 分析 数据集中,大家最喜欢给狗狗取什么名字?
plt.title("Top 10 favorite names")
plt.xlabel("Names")
plt.ylabel("Quantity")
print(twitter_archive_master.name.value_counts().iloc[0:10])
twitter_archive_master.name.value_counts().iloc[0:10].sort_values().plot("barh")
#结论,排名前10的最常见的10个名字如下
Cooper 10
Lucy 10
Charlie 10
Oliver 9
Tucker 8
Winston 7
Daisy 7
Penny 7
Sadie 7
Jax 6
Name: name, dtype: int64
<matplotlib.axes._subplots.AxesSubplot at 0x1c049385c88>

print(twitter_archive_master.Final_Grade.value_counts().iloc[0:10])
twitter_archive_master.Final_Grade.value_counts().iloc[0:10].plot("pie",autopct='%.2f',figsize=(8, 8))
#结论:下图可见最大面积的4个扇形都是1分及以上,22+25+21+13=81,百分之81的狗狗的评分是一分及以上的。
1.2 407
1.0 356
1.1 348
1.3 214
0.9 134
0.8 68
0.7 31
1.4 22
0.6 16
0.5 14
Name: Final_Grade, dtype: int64
<matplotlib.axes._subplots.AxesSubplot at 0x1c0496d3be0>