In [1]:
import requests
import os,re,json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
pd.set_option("max_colwidth",1000000)
pd.set_option('max_columns', 50)

Gather

Create a data frame twitter_archive

In [2]:
url="https://d17h27t6h515a5.cloudfront.net/topher/2017/August/59a4e958_twitter-archive-enhanced/twitter-archive-enhanced.csv"
twitter_archive=pd.read_csv(url)
twitter_archive.head(2)
Out[2]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU NaN NaN NaN https://twitter.com/dog_rates/status/892420643555336193/photo/1 13 10 Phineas None None None None
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV NaN NaN NaN https://twitter.com/dog_rates/status/892177421306343426/photo/1 13 10 Tilly None None None None
In [4]:
twitter_archive.shape
Out[4]:
(2356, 17)

Create a data frame images

In [6]:
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)

with open(url.split('/')[-1], mode='wb') as file:
    file.write(response.content)
    
images = pd.read_csv('image-predictions.tsv', sep='\t')
images.head(2)
Out[6]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh_springer_spaniel 0.465074 True collie 0.156665 True Shetland_sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True
In [6]:
images.shape
Out[6]:
(2075, 12)

Merge twitter_archive and images by tweet_id

In [7]:
archive_images=pd.merge(twitter_archive, images, how='outer',on='tweet_id')
archive_images.shape
Out[7]:
(2356, 28)

Search by tweed_id and retrieve favorite_count and retweet_count

In [3]:
import tweepy

consumer_key = 
consumer_secret = 

access_token = 
access_token_secret = 


auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)
In [11]:
tweet_list=[]


for tweet_id in archive_images['tweet_id']:
    try:
        tweet_status=api.get_status(tweet_id)._json
        favorite_count = tweet_status['favorite_count']
        retweet_count = tweet_status['retweet_count']
        tweet_list.append({'tweet_id': int(tweet_id),'favorite_count': int(favorite_count),'retweet_count': int(retweet_count)})
    
    except Exception:  
        print("error ids:  " +str(tweet_id))
        tweet_list.append({'tweet_id': int(tweet_id)})
error ids:  888202515573088257
error ids:  873697596434513921
error ids:  869988702071779329
error ids:  861769973181624320
error ids:  842892208864923648
error ids:  802247111496568832
error ids:  775096608509886464
In [12]:
tweet_table = pd.DataFrame(tweet_list, columns = ['tweet_id', 'favorite_count', 'retweet_count'])
In [13]:
tweet_table.to_csv('tweet_table.csv')
In [14]:
tweet_table=pd.read_csv('tweet_table.csv')
tweet_table.head(10)
Out[14]:
Unnamed: 0 tweet_id favorite_count retweet_count
0 0 892420643555336193 39306.0 8778.0
1 1 892177421306343426 33661.0 6428.0
2 2 891815181378084864 25358.0 4268.0
3 3 891689557279858688 42684.0 8863.0
4 4 891327558926688256 40841.0 9645.0
5 5 891087950875897856 20474.0 3213.0
6 6 890971913173991426 12012.0 2126.0
7 7 890729181411237888 66395.0 19421.0
8 8 890609185150312448 28103.0 4366.0
9 9 890240255349198849 32344.0 7617.0
In [15]:
tweet_table[tweet_table['favorite_count']==0]
Out[15]:
Unnamed: 0 tweet_id favorite_count retweet_count
32 32 886054160059072513 0.0 106.0
36 36 885311592912609280 0.0 19088.0
68 68 879130579576475649 0.0 7079.0
73 73 878404777348136964 0.0 1333.0
74 74 878316110768087041 0.0 6872.0
78 78 877611172832227328 0.0 82.0
91 91 874434818259525634 0.0 15324.0
97 97 873337748698140672 0.0 1642.0
101 101 872668790621863937 0.0 31.0
109 109 871166179821445120 0.0 5925.0
124 124 868639477480148993 0.0 2203.0
130 130 867072653475098625 0.0 133.0
132 132 866816280283807744 0.0 32510.0
137 137 866094527597207552 0.0 8823.0
146 146 863471782782697472 0.0 2636.0
159 159 860981674716409858 0.0 2295.0
160 160 860924035999428608 0.0 874.0
165 165 860177593139703809 0.0 32992.0
171 171 858860390427611136 0.0 8708.0
180 180 857062103051644929 0.0 182.0
182 182 856602993587888130 0.0 11505.0
185 185 856330835276025856 0.0 723.0
194 194 855245323840757760 0.0 6481.0
195 195 855138241867124737 0.0 50.0
204 204 852936405516943360 0.0 2209.0
211 211 851953902622658560 0.0 10586.0
212 212 851861385021730816 0.0 23.0
222 222 849668094696017920 0.0 5461.0
230 230 847978865427394560 0.0 3609.0
231 231 847971574464610304 0.0 474.0
... ... ... ... ...
778 778 775898661951791106 0.0 17403.0
794 794 773336787167145985 0.0 5827.0
800 800 772615324260794368 0.0 3854.0
811 811 771171053431250945 0.0 8600.0
815 815 771004394259247104 0.0 251.0
818 818 770743923962707968 0.0 51801.0
822 822 770093767776997377 0.0 3470.0
826 826 769335591808995329 0.0 8738.0
829 829 768909767477751808 0.0 3087.0
833 833 768554158521745409 0.0 6633.0
841 841 766864461642756096 0.0 6436.0
847 847 766078092750233600 0.0 2941.0
860 860 763167063695355904 0.0 3443.0
868 868 761750502866649088 0.0 4482.0
872 872 761371037149827077 0.0 20284.0
885 885 760153949710192640 0.0 38.0
890 890 759566828574212096 0.0 24037.0
895 895 759159934323924993 0.0 1336.0
908 908 757729163776290825 0.0 9197.0
911 911 757597904299253760 0.0 329.0
926 926 754874841593970688 0.0 9081.0
937 937 753298634498793472 0.0 6538.0
943 943 752701944171524096 0.0 3255.0
949 949 752309394570878976 0.0 18801.0
1012 1012 747242308580548608 0.0 3229.0
1023 1023 746521445350707200 0.0 1096.0
1043 1043 743835915802583040 0.0 2355.0
1242 1242 711998809858043904 0.0 138.0
2259 2259 667550904950915073 0.0 37.0
2260 2260 667550882905632768 0.0 34.0

174 rows × 4 columns

In [16]:
tweet_table.describe
Out[16]:
<bound method NDFrame.describe of       Unnamed: 0            tweet_id  favorite_count  retweet_count
0              0  892420643555336193         39306.0         8778.0
1              1  892177421306343426         33661.0         6428.0
2              2  891815181378084864         25358.0         4268.0
3              3  891689557279858688         42684.0         8863.0
4              4  891327558926688256         40841.0         9645.0
5              5  891087950875897856         20474.0         3213.0
6              6  890971913173991426         12012.0         2126.0
7              7  890729181411237888         66395.0        19421.0
8              8  890609185150312448         28103.0         4366.0
9              9  890240255349198849         32344.0         7617.0
10            10  890006608113172480         30993.0         7526.0
11            11  889880896479866881         28103.0         5084.0
12            12  889665388333682689         38570.0         8445.0
13            13  889638837579907072         27510.0         4663.0
14            14  889531135344209921         15285.0         2291.0
15            15  889278841981685760         25614.0         5584.0
16            16  888917238123831296         29444.0         4644.0
17            17  888804989199671297         25919.0         4494.0
18            18  888554962724278272         20186.0         3696.0
19            19  888202515573088257             NaN            NaN
20            20  888078434458587136         22053.0         3610.0
21            21  887705289381826560         30577.0         5546.0
22            22  887517139158093824         46755.0        11971.0
23            23  887473957103951883         70023.0        18749.0
24            24  887343217045368832         34077.0        10649.0
25            25  887101392804085760         30932.0         6113.0
26            26  886983233522544640         35643.0         7978.0
27            27  886736880519319552         12237.0         3389.0
28            28  886680336477933568         22701.0         4583.0
29            29  886366144734445568         21398.0         3269.0
...          ...                 ...             ...            ...
2326        2326  666411507551481857           456.0          337.0
2327        2327  666407126856765440           112.0           42.0
2328        2328  666396247373291520           170.0           90.0
2329        2329  666373753744588802           193.0           95.0
2330        2330  666362758909284353           798.0          587.0
2331        2331  666353288456101888           226.0           74.0
2332        2332  666345417576210432           305.0          144.0
2333        2333  666337882303524864           202.0           95.0
2334        2334  666293911632134144           515.0          365.0
2335        2335  666287406224695296           151.0           70.0
2336        2336  666273097616637952           181.0           80.0
2337        2337  666268910803644416           107.0           36.0
2338        2338  666104133288665088         14623.0         6791.0
2339        2339  666102155909144576            80.0           14.0
2340        2340  666099513787052032           159.0           72.0
2341        2341  666094000022159362           165.0           77.0
2342        2342  666082916733198337           119.0           46.0
2343        2343  666073100786774016           329.0          172.0
2344        2344  666071193221509120           153.0           65.0
2345        2345  666063827256086533           490.0          226.0
2346        2346  666058600524156928           116.0           59.0
2347        2347  666057090499244032           302.0          144.0
2348        2348  666055525042405380           447.0          259.0
2349        2349  666051853826850816          1243.0          873.0
2350        2350  666050758794694657           135.0           59.0
2351        2351  666049248165822465           110.0           40.0
2352        2352  666044226329800704           306.0          143.0
2353        2353  666033412701032449           127.0           46.0
2354        2354  666029285002620928           131.0           47.0
2355        2355  666020888022790149          2527.0          527.0

[2356 rows x 4 columns]>
In [17]:
tweet_table['favorite_count'].value_counts()
Out[17]:
0.0        174
5161.0       3
1376.0       3
2279.0       3
3573.0       3
3494.0       3
315.0        3
1841.0       3
2024.0       3
1254.0       3
912.0        3
2245.0       3
1067.0       3
467.0        3
447.0        3
2434.0       3
3719.0       3
2889.0       2
17372.0      2
3832.0       2
1709.0       2
2427.0       2
1491.0       2
1173.0       2
538.0        2
28103.0      2
1653.0       2
770.0        2
1753.0       2
193.0        2
          ... 
57784.0      1
7058.0       1
19922.0      1
9018.0       1
3539.0       1
39298.0      1
419.0        1
131.0        1
23098.0      1
2251.0       1
466.0        1
46755.0      1
7051.0       1
306.0        1
39996.0      1
12040.0      1
20952.0      1
8788.0       1
2870.0       1
16393.0      1
2383.0       1
1671.0       1
25090.0      1
2058.0       1
31732.0      1
10593.0      1
14194.0      1
10099.0      1
23379.0      1
117.0        1
Name: favorite_count, Length: 2004, dtype: int64
In [18]:
tweet_json = pd.merge(archive_images, tweet_table, how='outer',on='tweet_id')
In [19]:
tweet_json.head(2)
Out[19]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog Unnamed: 0 favorite_count retweet_count
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU NaN NaN NaN https://twitter.com/dog_rates/status/892420643555336193/photo/1 13 10 Phineas None None None None https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1.0 orange 0.097049 False bagel 0.085851 False banana 0.076110 False 0 39306.0 8778.0
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV NaN NaN NaN https://twitter.com/dog_rates/status/892177421306343426/photo/1 13 10 Tilly None None None None https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1.0 Chihuahua 0.323581 True Pekinese 0.090647 True papillon 0.068957 True 1 33661.0 6428.0
In [28]:
tweet_json.to_csv('tweet_json.txt')

Assess

In [3]:
tweet_json=pd.read_csv('tweet_json.txt')
In [30]:
tweet_json.head(5)
Out[30]:
Unnamed: 0 Unnamed: 0.1 tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog Unnamed: 0.1.1 favorite_count retweet_count
0 0 0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU NaN NaN NaN https://twitter.com/dog_rates/status/892420643555336193/photo/1 13 10 Phineas None None None None https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1.0 orange 0.097049 False bagel 0.085851 False banana 0.076110 False 0 39306.0 8778.0
1 1 1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV NaN NaN NaN https://twitter.com/dog_rates/status/892177421306343426/photo/1 13 10 Tilly None None None None https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1.0 Chihuahua 0.323581 True Pekinese 0.090647 True papillon 0.068957 True 1 33661.0 6428.0
2 2 2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB NaN NaN NaN https://twitter.com/dog_rates/status/891815181378084864/photo/1 12 10 Archie None None None None https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1.0 Chihuahua 0.716012 True malamute 0.078253 True kelpie 0.031379 True 2 25358.0 4268.0
3 3 3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ NaN NaN NaN https://twitter.com/dog_rates/status/891689557279858688/photo/1 13 10 Darla None None None None https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg 1.0 paper_towel 0.170278 False Labrador_retriever 0.168086 True spatula 0.040836 False 3 42684.0 8863.0
4 4 4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> This is Franklin. He would like you to stop calling him "cute." He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f NaN NaN NaN https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1 12 10 Franklin None None None None https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2.0 basset 0.555712 True English_springer 0.225770 True German_short-haired_pointer 0.175219 True 4 40841.0 9645.0
In [23]:
tweet_json.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 32 columns):
Unnamed: 0                    2356 non-null int64
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
jpg_url                       2075 non-null object
img_num                       2075 non-null float64
p1                            2075 non-null object
p1_conf                       2075 non-null float64
p1_dog                        2075 non-null object
p2                            2075 non-null object
p2_conf                       2075 non-null float64
p2_dog                        2075 non-null object
p3                            2075 non-null object
p3_conf                       2075 non-null float64
p3_dog                        2075 non-null object
Unnamed: 0.1                  2356 non-null int64
favorite_count                2349 non-null float64
retweet_count                 2349 non-null float64
dtypes: float64(10), int64(5), object(17)
memory usage: 589.1+ KB
In [24]:
tweet_json.describe()
Out[24]:
Unnamed: 0 tweet_id in_reply_to_status_id in_reply_to_user_id retweeted_status_id retweeted_status_user_id rating_numerator rating_denominator img_num p1_conf p2_conf p3_conf Unnamed: 0.1 favorite_count retweet_count
count 2356.000000 2.356000e+03 7.800000e+01 7.800000e+01 1.810000e+02 1.810000e+02 2356.000000 2356.000000 2075.000000 2075.000000 2.075000e+03 2.075000e+03 2356.000000 2349.000000 2349.000000
mean 1177.500000 7.427716e+17 7.455079e+17 2.014171e+16 7.720400e+17 1.241698e+16 13.126486 10.455433 1.203855 0.594548 1.345886e-01 6.032417e-02 1177.500000 8106.075351 3094.855258
std 680.262939 6.856705e+16 7.582492e+16 1.252797e+17 6.236928e+16 9.599254e+16 45.876648 6.745237 0.561875 0.271174 1.006657e-01 5.090593e-02 680.262939 12048.901960 5129.976423
min 0.000000 6.660209e+17 6.658147e+17 1.185634e+07 6.661041e+17 7.832140e+05 0.000000 0.000000 1.000000 0.044333 1.011300e-08 1.740170e-10 0.000000 0.000000 0.000000
25% 588.750000 6.783989e+17 6.757419e+17 3.086374e+08 7.186315e+17 4.196984e+09 10.000000 10.000000 1.000000 0.364412 5.388625e-02 1.622240e-02 588.750000 1414.000000 615.000000
50% 1177.500000 7.196279e+17 7.038708e+17 4.196984e+09 7.804657e+17 4.196984e+09 11.000000 10.000000 1.000000 0.588230 1.181810e-01 4.944380e-02 1177.500000 3585.000000 1444.000000
75% 1766.250000 7.993373e+17 8.257804e+17 4.196984e+09 8.203146e+17 4.196984e+09 12.000000 10.000000 1.000000 0.843855 1.955655e-01 9.180755e-02 1766.250000 10090.000000 3604.000000
max 2355.000000 8.924206e+17 8.862664e+17 8.405479e+17 8.874740e+17 7.874618e+17 1776.000000 170.000000 4.000000 1.000000 4.880140e-01 2.734190e-01 2355.000000 131748.000000 78704.000000
In [25]:
tweet_json['name'].value_counts(dropna=False)
Out[25]:
None           745
a               55
Charlie         12
Cooper          11
Oliver          11
Lucy            11
Lola            10
Penny           10
Tucker          10
Winston          9
Bo               9
Sadie            8
the              8
an               7
Toby             7
Buddy            7
Daisy            7
Bailey           7
Scout            6
Jax              6
Oscar            6
Dave             6
Stanley          6
Leo              6
Rusty            6
Koda             6
Milo             6
Jack             6
Bella            6
Gus              5
              ... 
Bode             1
Meatball         1
Brandi           1
Smiley           1
Chubbs           1
my               1
Lillie           1
Jazzy            1
Snicku           1
Cheesy           1
Pippin           1
Autumn           1
Richie           1
Venti            1
Maya             1
Baron            1
Jaspers          1
Bilbo            1
Kara             1
Bobb             1
Rooney           1
Billl            1
Kobe             1
Stuart           1
Nimbus           1
infuriating      1
Mutt             1
Clybe            1
Dante            1
Margo            1
Name: name, Length: 957, dtype: int64
In [26]:
tweet_json['favorite_count'].isnull().value_counts()
Out[26]:
False    2349
True        7
Name: favorite_count, dtype: int64

Quality

  1. The type of timestamp is an object rather than time format.

  2. The type of tweet_id and in_reply_to_status_id is numeric rather than string.

  3. The type of in_reply_to_user_id is numeric rather than string. There are two tweet_id appearing in the column in_reply_to_user_id which is for user_id only.

  4. The column source contains the whole tag instead of the text.

  5. There are emoji characters e.g. 🎶 inside the column text.The unicode character may not correctly be presented in other editor.

  6. The type of retweeted_status_id is numeric rather than string.

  7. The type of retweeted_status_user_id is numeric rather than string. There are three tweet_id appearing in the column retweeted_status_user_id which is for user_id only.

  8. The rating_numerator can be any positive integer, similarly the rating_denominator can be any positive integer out of 10. In fact, any number is possible because it is not a strict technical rating. There are a couple of records with a wrong rating because the author parsed the first fraction rather than the last. There are some tweets without a rating but the author mistakenly parsed a rating for it. There are some retweets back and forth upgrading their previous rating, of which those retweets do not attach links to the picture of the dog they are talking about. It's hard to track back that dog and merge these retweets with the previous rating tweets, I'd like to delete the retweets that is discussing the rating without a dog picture.

  9. The name column has meaningless dog name, like 'a', 'an', either the dog name does not exist in the text column or failure to retrieve the dog name. Also, @dog_rates can mispell a dog's name, e.g. "Johm".

  10. The favorite_count column has multiple 0 counts but with large number of retweet count, it's almost impossible except for the 'Like' button is disabled. There are 7 records with missing favorite_count and retweet_count because the status does not exist for that tweet.

Tidiness

  1. The column of expanded_urls contains records with multiple links, some records have duplicated links splitted by a comma. The reason is that a grid photo was tweeted, so multiple pictures being recorded, but only one of the sub-photo was use for image prediction.

  2. The columns of doggo floofer pupper puppo are all dog stages, converting these four columns into a stage column should be considered.

  3. There should be a column that prints the final breed prediction by summarizing p1 p2 p3 and p1_dog p2_dog p3_dog

Clean

Make a copy of tweet_json.

In [3]:
tweet_json=pd.read_csv(r'tweet_json.csv')
tweet_json_cleaned=tweet_json.copy()

Change the type of several columns

Change the type of timestamp and retweeted_status_timestamp

Change the type of tweet_id in_reply_to_status_id in_reply_to_user_id retweeted_status_id retweeted_status_user_id

In [4]:
tweet_json_cleaned['timestamp'] = pd.to_datetime(tweet_json_cleaned['timestamp'],utc=True)
tweet_json_cleaned['retweeted_status_timestamp'] = pd.to_datetime(tweet_json_cleaned['retweeted_status_timestamp'],utc=True)

tweet_json_cleaned['tweet_id'] = tweet_json_cleaned['tweet_id'].astype(int).astype(str)

tweet_json_cleaned['in_reply_to_status_id']=tweet_json_cleaned['in_reply_to_status_id'].apply(lambda x : str(int(x)) if not np.isnan(x) else x)
tweet_json_cleaned['in_reply_to_user_id'] = tweet_json_cleaned['in_reply_to_user_id'].apply(lambda x : str(int(x)) if not np.isnan(x) else x)
tweet_json_cleaned['retweeted_status_id'] = tweet_json_cleaned['retweeted_status_id'].apply(lambda x : str(int(x)) if not np.isnan(x) else x)
tweet_json_cleaned['retweeted_status_user_id'] = tweet_json_cleaned['retweeted_status_user_id'].apply(lambda x : str(int(x)) if not np.isnan(x) else x)
tweet_json_cleaned.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 33 columns):
Unnamed: 0                    2356 non-null int64
Unnamed: 0.1                  2356 non-null int64
tweet_id                      2356 non-null object
in_reply_to_status_id         78 non-null object
in_reply_to_user_id           78 non-null object
timestamp                     2356 non-null datetime64[ns]
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null object
retweeted_status_user_id      181 non-null object
retweeted_status_timestamp    181 non-null datetime64[ns]
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
jpg_url                       2075 non-null object
img_num                       2075 non-null float64
p1                            2075 non-null object
p1_conf                       2075 non-null float64
p1_dog                        2075 non-null object
p2                            2075 non-null object
p2_conf                       2075 non-null float64
p2_dog                        2075 non-null object
p3                            2075 non-null object
p3_conf                       2075 non-null float64
p3_dog                        2075 non-null object
Unnamed: 0.1.1                2356 non-null int64
favorite_count                2349 non-null float64
retweet_count                 2349 non-null float64
dtypes: datetime64[ns](2), float64(6), int64(5), object(20)
memory usage: 607.5+ KB

Create a stage column

Convert doggo floofer pupper puppo into a stage variable.

In [5]:
tweet_json_cleaned['stage']=tweet_json_cleaned['doggo']+tweet_json_cleaned['floofer']+tweet_json_cleaned['pupper']+tweet_json_cleaned['puppo']
In [6]:
def concat_stage(x):
    temp=re.sub('None','',x)
    if temp=="" :
        return 'None'
    else:
        return temp

tweet_json_cleaned['stage']=tweet_json_cleaned['stage'].apply(lambda x : concat_stage(x))
In [7]:
print(tweet_json_cleaned['stage'].value_counts())
None            1976
pupper           245
doggo             83
puppo             29
doggopupper       12
floofer            9
doggofloofer       1
doggopuppo         1
Name: stage, dtype: int64

Extract text from source column

In [8]:
tweet_json_cleaned['source']=tweet_json_cleaned['source'].apply(lambda x : BeautifulSoup(x, 'html.parser').get_text())
In [10]:
print(tweet_json_cleaned['source'].value_counts())
Twitter for iPhone     2221
Vine - Make a Scene      91
Twitter Web Client       33
TweetDeck                11
Name: source, dtype: int64

Create a breed prediction column

Create a breed_predict column.

In [12]:
for index, row in tweet_json_cleaned.iterrows():
    if row['p1_dog'] == True:
        tweet_json_cleaned.set_value(index,'breed_predict',row['p1'])
        
    elif row['p2_dog'] == True:
        tweet_json_cleaned.set_value(index,'breed_predict',row['p2'])
        
    elif row['p3_dog'] == True:
        tweet_json_cleaned.set_value(index,'breed_predict',row['p3'])


tweet_json_cleaned['breed_predict'].value_counts()
Out[12]:
golden_retriever                  173
Labrador_retriever                113
Pembroke                           96
Chihuahua                          95
pug                                65
toy_poodle                         52
chow                               51
Samoyed                            46
Pomeranian                         42
malamute                           34
cocker_spaniel                     34
French_bulldog                     32
Chesapeake_Bay_retriever           31
miniature_pinscher                 26
Cardigan                           23
Eskimo_dog                         22
Staffordshire_bullterrier          22
beagle                             21
German_shepherd                    21
Shih-Tzu                           20
Siberian_husky                     20
Shetland_sheepdog                  19
Rottweiler                         19
kuvasz                             19
Lakeland_terrier                   19
Maltese_dog                        19
Italian_greyhound                  17
basset                             17
West_Highland_white_terrier        16
American_Staffordshire_terrier     16
                                 ... 
Afghan_hound                        4
bluetick                            4
Scottish_deerhound                  4
Tibetan_terrier                     4
Welsh_springer_spaniel              4
giant_schnauzer                     4
Weimaraner                          4
Leonberg                            3
toy_terrier                         3
Irish_water_spaniel                 3
komondor                            3
Greater_Swiss_Mountain_dog          3
briard                              3
Brabancon_griffon                   3
curly-coated_retriever              3
cairn                               3
Australian_terrier                  2
black-and-tan_coonhound             2
groenendael                         2
wire-haired_fox_terrier             2
Appenzeller                         2
Sussex_spaniel                      2
Scotch_terrier                      1
Japanese_spaniel                    1
EntleBucher                         1
silky_terrier                       1
Irish_wolfhound                     1
clumber                             1
standard_schnauzer                  1
Bouvier_des_Flandres                1
Name: breed_predict, Length: 113, dtype: int64

Remake favorite count

For those tweets having 0 favorite_count, retrieve favorite_count from the retweet_status instead.

In [35]:
for index,row in tweet_json_cleaned.iterrows():
    if row['favorite_count']==0:
        try:
            tweet_status=api.get_status(int(row['tweet_id']))._json
            favorite_count = tweet_status['retweeted_status']['favorite_count']
            tweet_json_cleaned.set_value(index,'favorite_count',favorite_count)

            
        except Exception:  
            print("error ids:  " +row['tweet_id'])

  
        
In [36]:
tweet_json_cleaned['favorite_count'].value_counts(dropna=False)
Out[36]:
NaN         7
 1067.0     3
 1376.0     3
 2245.0     3
 912.0      3
 467.0      3
 3494.0     3
 2279.0     3
 5161.0     3
 3719.0     3
 1254.0     3
 2024.0     3
 315.0      3
 2434.0     3
 3573.0     3
 447.0      3
 1841.0     3
 2866.0     2
 3771.0     2
 12913.0    2
 487.0      2
 2869.0     2
 8845.0     2
 3142.0     2
 5704.0     2
 3201.0     2
 6951.0     2
 3687.0     2
 2657.0     2
 2969.0     2
           ..
 2058.0     1
 25090.0    1
 430.0      1
 1114.0     1
 482.0      1
 495.0      1
 780.0      1
 28881.0    1
 7058.0     1
 18548.0    1
 5542.0     1
 39298.0    1
 387.0      1
 131.0      1
 23098.0    1
 325.0      1
 2251.0     1
 466.0      1
 46755.0    1
 7051.0     1
 5295.0     1
 39996.0    1
 19922.0    1
 12040.0    1
 8788.0     1
 2870.0     1
 16393.0    1
 3599.0     1
 1671.0     1
 13684.0    1
Name: favorite_count, Length: 2135, dtype: int64
In [37]:
tweet_json_cleaned[tweet_json_cleaned['favorite_count'].isnull()]
Out[37]:
Unnamed: 0 Unnamed: 0.1 tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog Unnamed: 0.1.1 favorite_count retweet_count stage breed_predict
19 19 19 888202515573088257 NaN NaN 2017-07-21 01:02:36 Twitter for iPhone RT @dog_rates: This is Canela. She attempted some fancy porch pics. They were unsuccessful. 13/10 someone help her https://t.co/cLyzpcUcMX 887473957103951872 4196983835 2017-07-19 00:47:34 https://twitter.com/dog_rates/status/887473957103951883/photo/1,https://twitter.com/dog_rates/status/887473957103951883/photo/1,https://twitter.com/dog_rates/status/887473957103951883/photo/1,https://twitter.com/dog_rates/status/887473957103951883/photo/1 13 10 Canela None None None None https://pbs.twimg.com/media/DFDw2tyUQAAAFke.jpg 2.0 Pembroke 0.809197 True Rhodesian_ridgeback 0.054950 True beagle 0.038915 True 19 NaN NaN None Pembroke
95 95 95 873697596434513921 NaN NaN 2017-06-11 00:25:14 Twitter for iPhone RT @dog_rates: This is Walter. He won't start hydrotherapy without his favorite floatie. 14/10 keep it pup Walter https://t.co/r28jFx9uyF 868880397819494400 4196983835 2017-05-28 17:23:24 https://twitter.com/dog_rates/status/868880397819494401/photo/1,https://twitter.com/dog_rates/status/868880397819494401/photo/1 14 10 Walter None None None None https://pbs.twimg.com/media/DA7iHL5U0AA1OQo.jpg 1.0 laptop 0.153718 False French_bulldog 0.099984 True printer 0.077130 False 95 NaN NaN None French_bulldog
118 118 118 869988702071779329 NaN NaN 2017-05-31 18:47:24 Twitter for iPhone RT @dog_rates: We only rate dogs. This is quite clearly a smol broken polar bear. We'd appreciate if you only send dogs. Thank you... 12/10… 859196978902773760 4196983835 2017-05-02 00:04:57 https://twitter.com/dog_rates/status/859196978902773760/video/1 12 10 quite None None None None NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 118 NaN NaN None NaN
155 155 155 861769973181624320 NaN NaN 2017-05-09 02:29:07 Twitter for iPhone RT @dog_rates: "Good afternoon class today we're going to learn what makes a good boy so good" 13/10 https://t.co/f1h2Fsalv9 806629075125202944 4196983835 2016-12-07 22:38:52 https://twitter.com/dog_rates/status/806629075125202948/photo/1,https://twitter.com/dog_rates/status/806629075125202948/photo/1,https://twitter.com/dog_rates/status/806629075125202948/photo/1,https://twitter.com/dog_rates/status/806629075125202948/photo/1 13 10 None None None None None https://pbs.twimg.com/media/CzG425nWgAAnP7P.jpg 2.0 Arabian_camel 0.366248 False house_finch 0.209852 False cocker_spaniel 0.046403 True 155 NaN NaN None cocker_spaniel
260 260 260 842892208864923648 NaN NaN 2017-03-18 00:15:37 Twitter for iPhone RT @dog_rates: This is Stephan. He just wants to help. 13/10 such a good boy https://t.co/DkBYaCAg2d 807106840509214720 4196983835 2016-12-09 06:17:20 https://twitter.com/dog_rates/status/807106840509214720/video/1,https://twitter.com/dog_rates/status/807106840509214720/video/1 13 10 Stephan None None None None https://pbs.twimg.com/ext_tw_video_thumb/807106774843039744/pu/img/8XZg1xW35Xp2J6JW.jpg 1.0 Chihuahua 0.505370 True Pomeranian 0.120358 True toy_terrier 0.077008 True 260 NaN NaN None Chihuahua
566 566 566 802247111496568832 NaN NaN 2016-11-25 20:26:31 Twitter for iPhone RT @dog_rates: Everybody drop what you're doing and look at this dog. 13/10 must be super h*ckin rare https://t.co/I1bJUzUEW5 779056095788752896 4196983835 2016-09-22 20:33:42 https://twitter.com/dog_rates/status/779056095788752897/photo/1,https://twitter.com/dog_rates/status/779056095788752897/photo/1,https://twitter.com/dog_rates/status/779056095788752897/photo/1,https://twitter.com/dog_rates/status/779056095788752897/photo/1 13 10 None None None None None https://pbs.twimg.com/media/Cs_DYr1XEAA54Pu.jpg 1.0 Chihuahua 0.721188 True toy_terrier 0.112943 True kelpie 0.053365 True 566 NaN NaN None Chihuahua
784 784 784 775096608509886464 NaN NaN 2016-09-11 22:20:06 Twitter for iPhone RT @dog_rates: After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https:/… 740373189193256960 4196983835 2016-06-08 02:41:38 https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1 9 11 None None None None None NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 784 NaN NaN None NaN

Remake ratings

Remake rating_numerator and rating_denominator.

First, change the type of rating_numerator and rating_denominator into float. Create a dog count column labelling whether the row has multiple dogs or single dog.

In [13]:
tweet_json_cleaned['rating_numerator']=tweet_json_cleaned['rating_numerator'].astype(float)
tweet_json_cleaned['rating_denominator']=tweet_json_cleaned['rating_denominator'].astype(float)
In [14]:
for index,row in tweet_json_cleaned.iterrows():
            rating=[]
            rating=re.findall(r'(\d+(\.\d+)?)\/(\d+)',row['text'],flags=0)
            if rating !=[]:
                if len(rating)>1 and rating[0][-1]!='10':
                          tweet_json_cleaned.set_value(index,'dog_count','multiple')
                else:
                          tweet_json_cleaned.set_value(index,'dog_count','single')
                        
                tweet_json_cleaned.set_value(index,'rating_numerator',rating[-1][0])
                tweet_json_cleaned.set_value(index,'rating_denominator',rating[-1][-1])
            else:
                tweet_json_cleaned.set_value(index,'rating_numerator',np.nan)
                tweet_json_cleaned.set_value(index,'rating_denominator',np.nan)
In [40]:
print(tweet_json_cleaned['rating_numerator'].value_counts(dropna=False))
print(tweet_json_cleaned['rating_denominator'].value_counts(dropna=False))
12.00      557
11.00      467
10.00      454
13.00      355
9.00       156
8.00       104
7.00        56
14.00       56
5.00        34
6.00        33
3.00        19
4.00        17
2.00        10
1.00         9
0.00         2
15.00        2
9.75         2
420.00       2
84.00        1
24.00        1
13.50        1
143.00       1
80.00        1
99.00        1
182.00       1
165.00       1
45.00        1
204.00       1
1776.00      1
666.00       1
11.27        1
121.00       1
11.26        1
88.00        1
144.00       1
9.50         1
20.00        1
44.00        1
60.00        1
Name: rating_numerator, dtype: int64
10.0     2340
50.0        2
80.0        2
150.0       1
110.0       1
90.0        1
130.0       1
70.0        1
170.0       1
120.0       1
16.0        1
20.0        1
40.0        1
7.0         1
15.0        1
Name: rating_denominator, dtype: int64

Remake dog names

Define several rules to extract dog names from text column. Return the dog names to a new name column.

In [41]:
for index,row in tweet_json_cleaned.iterrows():
            text=row['text']
            ## Example: This is Charlie and Zoey.
            if re.match(r'.*This is [A-Z][A-Za-zñáéíóúü/-]+([.,!?\s]|( (and|(&amp;)) [A-Z][A-Za-zñáéíóúü/-]+))',text):
                
                name=re.findall(r'.*This is ([A-Z][A-Za-zñáéíóúü/-]+)[.,!?\s]',text)
                name2=re.findall(r'.* (and|(&amp;)) ([A-Z][A-Za-zñáéíóúü/-]+)',text)
                tweet_json_cleaned.set_value(index,'new_name',name[0])
            
                if name2 !=[]:
                    print(name2)
                    tweet_json_cleaned.set_value(index,'dog_count','multiple')
                    
            ## Example: Meet Charlie and Zoey.
            elif re.match(r'.*Meet [A-Z][A-Za-zñáéíóúü/-]+([.,!?\s]|( (and|(&amp;)) [A-Z][A-Za-zñáéíóúü/-]+))',text):
                
                name=re.findall(r'.*Meet ([A-Z][A-Za-zñáéíóúü/-]+)[.,!?\s]',text)  
                name2=re.findall(r'.* (and|(&amp;)) ([A-Z][A-Za-zñáéíóúü/-]+)',text)
                tweet_json_cleaned.set_value(index,'new_name',name[0])
                
                if name2 !=[]:
                    print(name2)
                    tweet_json_cleaned.set_value(index,'dog_count','multiple')
                    
            ## Example: Say hello to Charlie and Zoey.   
            elif re.match(r'.*Say hello to [A-Z][A-Za-zñáéíóúü/-]+([.,!?]|( (and|(&amp;)) [A-Z][A-Za-zñáéíóúü/-]+))',text):
                
                name=re.findall(r'.*Say hello to ([A-Z][A-Za-zñáéíóúü/-]+)[.,!?\s]',text)
                name2=re.findall(r'.* (and|(&amp;)) ([A-Z][A-Za-zñáéíóúü/-]+)',text)
                tweet_json_cleaned.set_value(index,'new_name',name[0])
                if name2 !=[]:
                    print(name2)
                    tweet_json_cleaned.set_value(index,'dog_count','multiple')
                    
            ## Example: This is a golden retriever named Charlie.      
            elif re.match(r'.*named [A-Z][A-Za-zñáéíóúü/-]+([.,!?\s]|( (and|(&amp;)) [A-Z][A-Za-zñáéíóúü/-]+))',text):
                
                name=re.findall(r'.*named ([A-Z][A-Za-zñáéíóúü/-]+)[.,!?\s]',text)
                name2=re.findall(r'.* (and|(&amp;)) ([A-Z][A-Za-zñáéíóúü/-]+)',text)
    
                tweet_json_cleaned.set_value(index,'new_name',name[0])
                if name2 !=[]:
                    print(name2)
                    tweet_json_cleaned.set_value(index,'dog_count','multiple')
                    
            ## Example: His name is Zoey.          
            elif re.match(r'.*name is [A-Z][A-Za-zñáéíóúü/-]+([.,!?\s]|( (and|(&amp;)) [A-Z][A-Za-zñáéíóúü/-]+))',text):
                
                name=re.findall(r'.*name is ([A-Z][A-Za-zñáéíóúü/-]+)[.,!?\s]',text)
                name2=re.findall(r'.* (and|(&amp;)) ([A-Z][A-Za-zñáéíóúü/-]+)',text)
                
                tweet_json_cleaned.set_value(index,'new_name',name[0])
                if name2 !=[]:
                    print(name2)
                    tweet_json_cleaned.set_value(index,'dog_count','multiple')    
            else:
                tweet_json_cleaned.set_value(index,'new_name',None)

                
[('and', '', 'Hank')]
[('and', '', 'Oliver')]
[('and', '', 'Morple')]
[('and', '', 'Daisy')]
[('and', '', 'Gunner')]
[('and', '', 'Gunner')]
[('&amp;', '&amp;', 'Harold')]
[('and', '', 'Jersey')]
[('and', '', 'Elsa')]
[('&amp;', '&amp;', 'Patti')]
[('&amp;', '&amp;', 'Wilbur')]
[('and', '', 'Harley')]
[('and', '', 'Gary')]
[('and', '', 'Murphy')]
[('and', '', 'Ty')]
[('and', '', 'Charlie')]
[('and', '', 'Ty')]
[('&amp;', '&amp;', 'Lila')]
[('&amp;', '&amp;', 'Honey')]
[('&amp;', '&amp;', 'Jil')]
[('and', '', 'Seaweed')]
[('and', '', 'Thumpelina')]
[('and', '', 'Jupiter')]
[('and', '', 'Millie')]
[('&amp;', '&amp;', 'Murphy')]
[('and', '', 'Sophie')]
[('and', '', 'Flurp')]
[('and', '', 'Clarkus')]
[('and', '', 'Roxy')]
[('and', '', 'Ruby')]
[('and', '', 'Amos')]
[('&amp;', '&amp;', 'Kiwi')]
[('&amp;', '&amp;', 'Wilbur')]
[('&amp;', '&amp;', 'Ruffalo')]
[('and', '', 'Plip')]
[('and', '', 'Taco')]
[('and', '', 'Comcast')]
[('&amp;', '&amp;', 'Charlie')]
[('and', '', 'Izzy')]
[('&amp;', '&amp;', 'Bee')]
[('&amp;', '&amp;', 'Gizmo')]
[('and', '', 'Bush')]
[('and', '', 'Pablo')]
[('&amp;', '&amp;', 'Tonic')]
[('and', '', 'Sully')]
[('&amp;', '&amp;', 'Carson')]
[('and', '', 'Gentleman')]
[('and', '', 'Milo')]
[('and', '', 'Hall')]
[('and', '', 'Twips')]
In [42]:
tweet_json_cleaned[tweet_json_cleaned['new_name'].isnull()].to_csv('new_name.csv')
In [43]:
tweet_json_cleaned['new_name'].value_counts(dropna=False)
Out[43]:
NaN           832
Charlie        12
Cooper         11
Oliver         11
Lucy           11
Lola           10
Penny          10
Tucker         10
Bo              9
Winston         9
Sadie           8
Bailey          7
Toby            7
Buddy           7
Daisy           7
Oscar           6
Leo             6
Milo            6
Rusty           6
Bella           6
Koda            6
Stanley         6
Jax             6
Scout           6
Finn            5
Sammy           5
Oakley          5
Alfie           5
Bentley         5
Sunny           5
             ... 
Maks            1
Bradlay         1
Erik            1
Brandonald      1
Crumpet         1
Eazy-E          1
Mutt            1
Alfonso         1
Thor            1
Grady           1
Adele           1
Harrison        1
Grizz           1
Tino            1
Enchilada       1
Glacier         1
Claude          1
Theo            1
Dale            1
Grizzwald       1
Kloey           1
Arlen           1
Meera           1
Lilah           1
Cora            1
Jaycob          1
Jim             1
Teddy           1
Anthony         1
Striker         1
Name: new_name, Length: 953, dtype: int64

Remove surplus columns and rows

Remove tweets without a jpg_url.

Remove rows with multiple dog_count because the model only considers and predicts one dog.

Remove column "unnamed: 0" "name" and "dog_count"

In [44]:
tweet_json_cleaned=tweet_json_cleaned[tweet_json_cleaned['jpg_url'].notnull()]
print(tweet_json_cleaned.shape)
tweet_json_cleaned=tweet_json_cleaned[tweet_json_cleaned['dog_count']=='single']
print(tweet_json_cleaned.shape)
del tweet_json_cleaned['name']
del tweet_json_cleaned['dog_count']
del tweet_json_cleaned['Unnamed: 0.1']
del tweet_json_cleaned['doggo']
del tweet_json_cleaned['floofer']
del tweet_json_cleaned['puppo']
del tweet_json_cleaned['pupper']
del tweet_json_cleaned['Unnamed: 0.1.1']
del tweet_json_cleaned['Unnamed: 0']
(2075, 37)
(2022, 37)
In [45]:
tweet_json_cleaned.shape
Out[45]:
(2022, 29)
In [46]:
tweet_json_cleaned.sample(3)
Out[46]:
Unnamed: 0 tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog favorite_count retweet_count stage breed_predict new_name
1966 1966 673343217010679808 NaN NaN 2015-12-06 03:28:27 Twitter for iPhone IT'S SO SMALL ERMERGERF 11/10 https://t.co/dNUbKOSiWW NaN NaN NaT https://twitter.com/dog_rates/status/673343217010679808/photo/1 11.0 10.0 https://pbs.twimg.com/media/CVgyFSyU4AA9p1e.jpg 1.0 Chihuahua 0.541408 True Italian_greyhound 0.156891 True miniature_pinscher 0.069556 True 1025.0 288.0 None Chihuahua None
1573 1573 687494652870668288 NaN NaN 2016-01-14 04:41:12 Twitter for iPhone This is Marq. He stole this car. 7/10 wtf Marq? https://t.co/MHScqo5l8c NaN NaN NaT https://twitter.com/dog_rates/status/687494652870668288/photo/1 7.0 10.0 https://pbs.twimg.com/media/CYp4vFrVAAEs9AX.jpg 1.0 Rottweiler 0.391471 True miniature_pinscher 0.273595 True Tibetan_mastiff 0.041692 True 2087.0 650.0 None Rottweiler Marq
1336 1336 705223444686888960 NaN NaN 2016-03-03 02:49:06 Twitter for iPhone This is Bode. He's a heavy sleeper. 9/10 https://t.co/YMkxhGWUqv NaN NaN NaT https://twitter.com/dog_rates/status/705223444686888960/photo/1 9.0 10.0 https://pbs.twimg.com/media/Ccl0-HVVAAAf8aK.jpg 1.0 Egyptian_cat 0.090508 False Chesapeake_Bay_retriever 0.077373 True Mexican_hairless 0.049472 True 2796.0 892.0 None Chesapeake_Bay_retriever Bode
In [47]:
tweet_json_cleaned.to_csv('twitter_archive_master.csv')