import requests
import os,re,json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
# Display settings: show full cell contents and up to 50 columns.
# Use fully-qualified option names; bare "max_colwidth"/"max_columns"
# rely on pandas' fuzzy prefix matching of option keys.
pd.set_option("display.max_colwidth", 1000000)
pd.set_option("display.max_columns", 50)
Create a data frame twitter_archive
# Gather: read the WeRateDogs enhanced archive straight from the hosted CSV.
archive_url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/59a4e958_twitter-archive-enhanced/twitter-archive-enhanced.csv"
twitter_archive = pd.read_csv(archive_url)
# Quick visual/structural check of what was loaded.
twitter_archive.head(2)
twitter_archive.shape
Create a data frame images
# Gather: download the image-predictions TSV programmatically and load it.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
# Fail fast on a bad download instead of silently writing an error page to disk.
response.raise_for_status()
with open(url.split('/')[-1], mode='wb') as file:
    file.write(response.content)
images = pd.read_csv('image-predictions.tsv', sep='\t')
images.head(2)
images.shape
Merge twitter_archive and images by tweet_id
# Combine the archive with the image predictions on tweet_id.
# Outer join: keep rows present in either table.
archive_images = twitter_archive.merge(images, how='outer', on='tweet_id')
archive_images.shape
Search by tweet_id and retrieve favorite_count and retweet_count
import tweepy
# Twitter API credentials -- redacted before sharing. Fill these in to run;
# the original bare `consumer_key =` lines were syntax errors.
consumer_key = 'REDACTED'
consumer_secret = 'REDACTED'
access_token = 'REDACTED'
access_token_secret = 'REDACTED'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit makes tweepy sleep through rate-limit windows
# instead of raising, which matters for ~2000+ lookups.
api = tweepy.API(auth, wait_on_rate_limit=True)
# Query the Twitter API once per tweet id and collect the engagement counts.
# Ids whose status lookup fails (deleted tweets, etc.) are recorded with
# counts missing so the row count still matches the archive.
tweet_list = []
for tweet_id in archive_images['tweet_id']:
    try:
        status_json = api.get_status(tweet_id)._json
        record = {
            'tweet_id': int(tweet_id),
            'favorite_count': int(status_json['favorite_count']),
            'retweet_count': int(status_json['retweet_count']),
        }
    except Exception:
        print("error ids: " + str(tweet_id))
        record = {'tweet_id': int(tweet_id)}
    tweet_list.append(record)
tweet_table = pd.DataFrame(tweet_list, columns=['tweet_id', 'favorite_count', 'retweet_count'])
# Persist so the slow API pass never has to be repeated.
tweet_table.to_csv('tweet_table.csv')
# Reload the cached API results and assess them.
tweet_table = pd.read_csv('tweet_table.csv')
tweet_table.head(10)
# Tweets reporting zero favourites look suspicious (see assessment notes).
tweet_table[tweet_table['favorite_count'] == 0]
# Bug fix: `tweet_table.describe` (no parentheses) only evaluated the bound
# method and never produced the summary statistics.
tweet_table.describe()
tweet_table['favorite_count'].value_counts()
# Join the API engagement counts onto the archive/image table,
# persist the combined table, and assess it.
tweet_json = archive_images.merge(tweet_table, how='outer', on='tweet_id')
tweet_json.head(2)
tweet_json.to_csv('tweet_json.txt')
tweet_json = pd.read_csv('tweet_json.txt')
tweet_json.head(5)
tweet_json.info()
tweet_json.describe()
# How often does each parsed dog name occur (including missing)?
tweet_json['name'].value_counts(dropna=False)
# How many rows are missing an API favourite count?
tweet_json['favorite_count'].isnull().value_counts()
The type of timestamp is an object rather than time format.
The type of tweet_id and in_reply_to_status_id is numeric rather than string.
The type of in_reply_to_user_id is numeric rather than string. There are two tweet_ids appearing in the in_reply_to_user_id column, which should contain user_ids only.
The column source contains the whole tag instead of the text.
There are emoji characters e.g. 🎶 inside the column text.The unicode character may not correctly be presented in other editor.
The type of retweeted_status_id is numeric rather than string.
The type of retweeted_status_user_id is numeric rather than string. There are three tweet_ids appearing in the retweeted_status_user_id column, which should contain user_ids only.
The rating_numerator can be any positive integer, and similarly the rating_denominator can be any positive integer, not just 10. In fact, any number is possible because it is not a strict technical rating. There are a couple of records with a wrong rating because the author parsed the first fraction rather than the last. There are some tweets without a rating for which the author mistakenly parsed a rating anyway. There are some back-and-forth retweets upgrading a previous rating, and those retweets do not attach links to the picture of the dog they are talking about. It is hard to trace that dog and merge these retweets with the previous rating tweets, so I'd like to delete the retweets that discuss a rating without a dog picture.
The name column contains meaningless dog names, like 'a' or 'an': either the dog name does not appear in the text column or retrieval of the dog name failed. Also, @dog_rates can misspell a dog's name, e.g. "Johm".
The favorite_count column has multiple 0 counts paired with large retweet counts, which is almost impossible unless the 'Like' button was disabled. There are 7 records with missing favorite_count and retweet_count because the status no longer exists for those tweets.
The expanded_urls column contains records with multiple links, and some records have duplicated links separated by a comma. The reason is that a grid photo was tweeted, so multiple pictures were recorded, but only one of the sub-photos was used for image prediction.
The columns of doggo floofer pupper puppo are all dog stages, converting these four columns into a stage column should be considered.
Make a copy of tweet_json.
# Reload the merged table from disk and clean a copy of it.
# NOTE(review): this reads 'tweet_json.csv', but the code above only ever
# writes 'tweet_json.txt' -- presumably a stale file from an earlier run;
# confirm which file is intended (the extra 'Unnamed: 0.1*' columns deleted
# later suggest this CSV went through several to_csv/read_csv round trips).
tweet_json=pd.read_csv(r'tweet_json.csv')
tweet_json_cleaned=tweet_json.copy()
Change the type of timestamp and retweeted_status_timestamp
Change the type of tweet_id in_reply_to_status_id in_reply_to_user_id retweeted_status_id retweeted_status_user_id
# Clean: parse the two timestamp columns as timezone-aware datetimes (UTC).
tweet_json_cleaned['timestamp'] = pd.to_datetime(tweet_json_cleaned['timestamp'], utc=True)
tweet_json_cleaned['retweeted_status_timestamp'] = pd.to_datetime(
    tweet_json_cleaned['retweeted_status_timestamp'], utc=True)
# Ids are identifiers, not quantities -- store them as strings.
tweet_json_cleaned['tweet_id'] = tweet_json_cleaned['tweet_id'].astype(int).astype(str)


def _float_id_to_str(value):
    """Render a float-typed id as its integer string; pass NaN through unchanged."""
    return str(int(value)) if not np.isnan(value) else value


# These id columns are float64 because they contain NaN; convert the
# non-null values with one shared helper instead of four copied lambdas.
for _col in ['in_reply_to_status_id', 'in_reply_to_user_id',
             'retweeted_status_id', 'retweeted_status_user_id']:
    tweet_json_cleaned[_col] = tweet_json_cleaned[_col].apply(_float_id_to_str)
tweet_json_cleaned.info()
Convert doggo floofer pupper puppo into a stage variable.
tweet_json_cleaned['stage']=tweet_json_cleaned['doggo']+tweet_json_cleaned['floofer']+tweet_json_cleaned['pupper']+tweet_json_cleaned['puppo']
def concat_stage(x):
    """Collapse the concatenated stage columns into one stage label.

    `x` is the concatenation of the four stage columns, each of which is
    either a stage word or the literal string 'None'
    (e.g. 'doggoNoneNoneNone').  Strip the 'None' fillers; if nothing
    remains the tweet has no stage.  Tweets with several stages come out
    fused (e.g. 'doggopupper').

    Note: a literal replacement needs only str.replace, not re.sub.
    """
    stage = x.replace('None', '')
    return stage if stage else 'None'
# Apply the collapser directly -- wrapping concat_stage in a lambda that
# only forwards its argument was redundant.
tweet_json_cleaned['stage'] = tweet_json_cleaned['stage'].apply(concat_stage)
print(tweet_json_cleaned['stage'].value_counts())
def _source_text(markup):
    # The source column stores a whole <a href=...> tag; keep only its text.
    return BeautifulSoup(markup, 'html.parser').get_text()


tweet_json_cleaned['source'] = tweet_json_cleaned['source'].apply(_source_text)
print(tweet_json_cleaned['source'].value_counts())
Create a breed_predict column.
# Clean: record the most confident image prediction that is actually a dog
# breed (p1 first, then p2, then p3); rows with no dog prediction stay NaN.
# DataFrame.set_value was removed in pandas 1.0 -- use .at instead.
tweet_json_cleaned['breed_predict'] = np.nan
for index, row in tweet_json_cleaned.iterrows():
    # `== True` (not truthiness) guards against NaN from the outer merge,
    # since float NaN is truthy.
    if row['p1_dog'] == True:
        tweet_json_cleaned.at[index, 'breed_predict'] = row['p1']
    elif row['p2_dog'] == True:
        tweet_json_cleaned.at[index, 'breed_predict'] = row['p2']
    elif row['p3_dog'] == True:
        tweet_json_cleaned.at[index, 'breed_predict'] = row['p3']
tweet_json_cleaned['breed_predict'].value_counts()
For those tweets having 0 favorite_count, retrieve favorite_count from the retweet_status instead.
# Clean: tweets showing 0 favourites are typically retweets -- the favourites
# live on the original status, so pull retweeted_status.favorite_count.
for index, row in tweet_json_cleaned.iterrows():
    if row['favorite_count'] == 0:
        try:
            tweet_status = api.get_status(int(row['tweet_id']))._json
            favorite_count = tweet_status['retweeted_status']['favorite_count']
            # .at replaces the removed DataFrame.set_value (pandas >= 1.0).
            tweet_json_cleaned.at[index, 'favorite_count'] = favorite_count
        except Exception:
            # Status gone or not a retweet -- leave the value and move on.
            print("error ids: " + row['tweet_id'])
tweet_json_cleaned['favorite_count'].value_counts(dropna=False)
tweet_json_cleaned[tweet_json_cleaned['favorite_count'].isnull()]
Remake rating_numerator and rating_denominator.
First, change the type of rating_numerator and rating_denominator into float. Create a dog count column labelling whether the row has multiple dogs or single dog.
# Clean: re-parse ratings from the text, taking the LAST fraction in each
# tweet (the author's original parse took the first, which is wrong when an
# earlier date like "4/20" precedes the real rating).
tweet_json_cleaned['rating_numerator'] = tweet_json_cleaned['rating_numerator'].astype(float)
tweet_json_cleaned['rating_denominator'] = tweet_json_cleaned['rating_denominator'].astype(float)
tweet_json_cleaned['dog_count'] = np.nan
# Compile once; the pattern captures (numerator, optional decimals, denominator).
rating_pattern = re.compile(r'(\d+(\.\d+)?)\/(\d+)')
for index, row in tweet_json_cleaned.iterrows():
    fractions = rating_pattern.findall(row['text'])
    if fractions:
        # Several fractions with a non-/10 first denominator indicate a
        # group-photo rating like "44/40" split across dogs.
        if len(fractions) > 1 and fractions[0][-1] != '10':
            tweet_json_cleaned.at[index, 'dog_count'] = 'multiple'
        else:
            tweet_json_cleaned.at[index, 'dog_count'] = 'single'
        # .at replaces the removed DataFrame.set_value; cast the captured
        # strings to float to match the columns' dtype set above.
        tweet_json_cleaned.at[index, 'rating_numerator'] = float(fractions[-1][0])
        tweet_json_cleaned.at[index, 'rating_denominator'] = float(fractions[-1][-1])
    else:
        # No fraction in the text: the archive's parsed rating was spurious.
        tweet_json_cleaned.at[index, 'rating_numerator'] = np.nan
        tweet_json_cleaned.at[index, 'rating_denominator'] = np.nan
print(tweet_json_cleaned['rating_numerator'].value_counts(dropna=False))
print(tweet_json_cleaned['rating_denominator'].value_counts(dropna=False))
Define several rules to extract dog names from text column. Return the dog names to a new name column.
# Clean: extract dog names from the tweet text with a small rule table.
# Each rule is a (trigger regex, capture regex) pair tried in order; the
# first trigger that matches wins. The per-rule regexes are kept exactly as
# in the original branch-per-pattern version, including the narrower
# punctuation class ([.,!?] without \s) in the 'Say hello to' trigger.
_NAME = r'[A-Z][A-Za-zñáéíóúü/-]+'
_NAME_RULES = [
    # Example: "This is Charlie and Zoey."
    (r'.*This is ' + _NAME + r'([.,!?\s]|( (and|(&)) ' + _NAME + r'))',
     r'.*This is (' + _NAME + r')[.,!?\s]'),
    # Example: "Meet Charlie and Zoey."
    (r'.*Meet ' + _NAME + r'([.,!?\s]|( (and|(&)) ' + _NAME + r'))',
     r'.*Meet (' + _NAME + r')[.,!?\s]'),
    # Example: "Say hello to Charlie and Zoey."
    (r'.*Say hello to ' + _NAME + r'([.,!?]|( (and|(&)) ' + _NAME + r'))',
     r'.*Say hello to (' + _NAME + r')[.,!?\s]'),
    # Example: "This is a golden retriever named Charlie."
    (r'.*named ' + _NAME + r'([.,!?\s]|( (and|(&)) ' + _NAME + r'))',
     r'.*named (' + _NAME + r')[.,!?\s]'),
    # Example: "His name is Zoey."
    (r'.*name is ' + _NAME + r'([.,!?\s]|( (and|(&)) ' + _NAME + r'))',
     r'.*name is (' + _NAME + r')[.,!?\s]'),
]
# A second capitalised name after "and"/"&" means multiple dogs in the tweet.
_COMPANION = r'.* (and|(&)) (' + _NAME + r')'

tweet_json_cleaned['new_name'] = None
for index, row in tweet_json_cleaned.iterrows():
    text = row['text']
    for trigger, capture in _NAME_RULES:
        if re.match(trigger, text):
            # .at replaces the removed DataFrame.set_value (pandas >= 1.0).
            tweet_json_cleaned.at[index, 'new_name'] = re.findall(capture, text)[0]
            companions = re.findall(_COMPANION, text)
            if companions:
                print(companions)
                tweet_json_cleaned.at[index, 'dog_count'] = 'multiple'
            break
    else:
        # No rule matched: no extractable name in this tweet.
        tweet_json_cleaned.at[index, 'new_name'] = None
# Dump the unnamed rows for manual inspection.
tweet_json_cleaned[tweet_json_cleaned['new_name'].isnull()].to_csv('new_name.csv')
tweet_json_cleaned['new_name'].value_counts(dropna=False)
Remove tweets without a jpg_url.
Remove rows with multiple dog_count because the model only considers and predicts one dog.
Remove the columns "Unnamed: 0", "name", and "dog_count".
# Keep only tweets that actually have an image prediction.
tweet_json_cleaned = tweet_json_cleaned[tweet_json_cleaned['jpg_url'].notnull()]
print(tweet_json_cleaned.shape)
# The prediction model only handles a single dog per image.
tweet_json_cleaned = tweet_json_cleaned[tweet_json_cleaned['dog_count'] == 'single']
print(tweet_json_cleaned.shape)
# Drop the superseded columns plus the CSV round-trip index artifacts.
# errors='ignore': how many 'Unnamed: *' columns exist depends on how many
# to_csv/read_csv round trips the file went through, so don't crash on ones
# that are absent.
_drop_cols = ['name', 'dog_count', 'doggo', 'floofer', 'puppo', 'pupper',
              'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1']
tweet_json_cleaned = tweet_json_cleaned.drop(columns=_drop_cols, errors='ignore')
tweet_json_cleaned.shape
tweet_json_cleaned.sample(3)
# index=False keeps the master CSV free of yet another unnamed index column.
tweet_json_cleaned.to_csv('twitter_archive_master.csv', index=False)