import requests
import os,re,json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
# Display settings: show full cell contents and up to 50 columns.
# Use fully-qualified option names; bare "max_colwidth"/"max_columns"
# rely on pandas' fuzzy prefix matching of option keys.
pd.set_option("display.max_colwidth", 1000000)
pd.set_option("display.max_columns", 50)
Create a data frame twitter_archive
# Gather: read the WeRateDogs enhanced archive straight from the hosted CSV.
archive_url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/59a4e958_twitter-archive-enhanced/twitter-archive-enhanced.csv"
twitter_archive = pd.read_csv(archive_url)
# Quick visual/structural check of what was loaded.
twitter_archive.head(2)
twitter_archive.shape
Create a data frame images
# Gather: download the image-predictions TSV programmatically and load it.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
# Fail fast on a bad download instead of silently writing an error page to disk.
response.raise_for_status()
with open(url.split('/')[-1], mode='wb') as file:
    file.write(response.content)
images = pd.read_csv('image-predictions.tsv', sep='\t')
images.head(2)
images.shape
Merge twitter_archive and images by tweet_id
# Combine the archive with the image predictions on tweet_id.
# Outer join: keep rows present in either table.
archive_images = twitter_archive.merge(images, how='outer', on='tweet_id')
archive_images.shape
Search by tweet_id and retrieve favorite_count and retweet_count
import tweepy
# Twitter API credentials -- redacted before sharing. Fill these in to run;
# the original bare `consumer_key =` lines were syntax errors.
consumer_key = 'REDACTED'
consumer_secret = 'REDACTED'
access_token = 'REDACTED'
access_token_secret = 'REDACTED'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit makes tweepy sleep through rate-limit windows
# instead of raising, which matters for ~2000+ lookups.
api = tweepy.API(auth, wait_on_rate_limit=True)
# Query the Twitter API once per tweet id and collect the engagement counts.
# Ids whose status lookup fails (deleted tweets, etc.) are recorded with
# counts missing so the row count still matches the archive.
tweet_list = []
for tweet_id in archive_images['tweet_id']:
    try:
        status_json = api.get_status(tweet_id)._json
        record = {
            'tweet_id': int(tweet_id),
            'favorite_count': int(status_json['favorite_count']),
            'retweet_count': int(status_json['retweet_count']),
        }
    except Exception:
        print("error ids: " + str(tweet_id))
        record = {'tweet_id': int(tweet_id)}
    tweet_list.append(record)
tweet_table = pd.DataFrame(tweet_list, columns=['tweet_id', 'favorite_count', 'retweet_count'])
# Persist so the slow API pass never has to be repeated.
tweet_table.to_csv('tweet_table.csv')
# Reload the cached API results and assess them.
tweet_table = pd.read_csv('tweet_table.csv')
tweet_table.head(10)
# Tweets reporting zero favourites look suspicious (see assessment notes).
tweet_table[tweet_table['favorite_count'] == 0]
# Bug fix: `tweet_table.describe` (no parentheses) only evaluated the bound
# method and never produced the summary statistics.
tweet_table.describe()
tweet_table['favorite_count'].value_counts()
# Join the API engagement counts onto the archive/image table,
# persist the combined table, and assess it.
tweet_json = archive_images.merge(tweet_table, how='outer', on='tweet_id')
tweet_json.head(2)
tweet_json.to_csv('tweet_json.txt')
tweet_json = pd.read_csv('tweet_json.txt')
tweet_json.head(5)
tweet_json.info()
tweet_json.describe()
# How often does each parsed dog name occur (including missing)?
tweet_json['name'].value_counts(dropna=False)
# How many rows are missing an API favourite count?
tweet_json['favorite_count'].isnull().value_counts()
The type of timestamp is an object rather than time format.
The type of tweet_id and in_reply_to_status_id is numeric rather than string.
The type of in_reply_to_user_id is numeric rather than string. There are two tweet_ids appearing in the in_reply_to_user_id column, which should contain user_ids only.
The column source contains the whole tag instead of the text.
There are emoji characters e.g. 🎶 inside the column text.The unicode character may not correctly be presented in other editor.
The type of retweeted_status_id is numeric rather than string.
The type of retweeted_status_user_id is numeric rather than string. There are three tweet_ids appearing in the retweeted_status_user_id column, which should contain user_ids only.
The rating_numerator can be any positive integer, and similarly the rating_denominator can be any positive integer, not just 10. In fact, any number is possible because it is not a strict technical rating. There are a couple of records with a wrong rating because the author parsed the first fraction rather than the last. There are some tweets without a rating for which the author mistakenly parsed a rating anyway. There are some back-and-forth retweets upgrading a previous rating, and those retweets do not attach links to the picture of the dog they are talking about. It is hard to trace that dog and merge these retweets with the previous rating tweets, so I'd like to delete the retweets that discuss a rating without a dog picture.
The name column contains meaningless dog names, like 'a' or 'an': either the dog name does not appear in the text column or retrieval of the dog name failed. Also, @dog_rates can misspell a dog's name, e.g. "Johm".
The favorite_count column has multiple 0 counts paired with large retweet counts, which is almost impossible unless the 'Like' button was disabled. There are 7 records with missing favorite_count and retweet_count because the status no longer exists for those tweets.
The expanded_urls column contains records with multiple links, and some records have duplicated links separated by a comma. The reason is that a grid photo was tweeted, so multiple pictures were recorded, but only one of the sub-photos was used for image prediction.
The columns of doggo floofer pupper puppo are all dog stages, converting these four columns into a stage column should be considered.
Make a copy of tweet_json.
# Reload the merged table from disk and clean a copy of it.
# NOTE(review): this reads 'tweet_json.csv', but the code above only ever
# writes 'tweet_json.txt' -- presumably a stale file from an earlier run;
# confirm which file is intended (the extra 'Unnamed: 0.1*' columns deleted
# later suggest this CSV went through several to_csv/read_csv round trips).
tweet_json=pd.read_csv(r'tweet_json.csv')
tweet_json_cleaned=tweet_json.copy()
Change the type of timestamp and retweeted_status_timestamp
Change the type of tweet_id in_reply_to_status_id in_reply_to_user_id retweeted_status_id retweeted_status_user_id
# Clean: parse the two timestamp columns as timezone-aware datetimes (UTC).
tweet_json_cleaned['timestamp'] = pd.to_datetime(tweet_json_cleaned['timestamp'], utc=True)
tweet_json_cleaned['retweeted_status_timestamp'] = pd.to_datetime(
    tweet_json_cleaned['retweeted_status_timestamp'], utc=True)
# Ids are identifiers, not quantities -- store them as strings.
tweet_json_cleaned['tweet_id'] = tweet_json_cleaned['tweet_id'].astype(int).astype(str)


def _float_id_to_str(value):
    """Render a float-typed id as its integer string; pass NaN through unchanged."""
    return str(int(value)) if not np.isnan(value) else value


# These id columns are float64 because they contain NaN; convert the
# non-null values with one shared helper instead of four copied lambdas.
for _col in ['in_reply_to_status_id', 'in_reply_to_user_id',
             'retweeted_status_id', 'retweeted_status_user_id']:
    tweet_json_cleaned[_col] = tweet_json_cleaned[_col].apply(_float_id_to_str)
tweet_json_cleaned.info()
Convert doggo floofer pupper puppo into a stage variable.
tweet_json_cleaned['stage']=tweet_json_cleaned['doggo']+tweet_json_cleaned['floofer']+tweet_json_cleaned['pupper']+tweet_json_cleaned['puppo']
def concat_stage(x):
    """Collapse the concatenated stage columns into one stage label.

    `x` is the concatenation of the four stage columns, each of which is
    either a stage word or the literal string 'None'
    (e.g. 'doggoNoneNoneNone').  Strip the 'None' fillers; if nothing
    remains the tweet has no stage.  Tweets with several stages come out
    fused (e.g. 'doggopupper').

    Note: a literal replacement needs only str.replace, not re.sub.
    """
    stage = x.replace('None', '')
    return stage if stage else 'None'
# Apply the collapser directly -- wrapping concat_stage in a lambda that
# only forwards its argument was redundant.
tweet_json_cleaned['stage'] = tweet_json_cleaned['stage'].apply(concat_stage)
print(tweet_json_cleaned['stage'].value_counts())
def _source_text(markup):
    # The source column stores a whole <a href=...> tag; keep only its text.
    return BeautifulSoup(markup, 'html.parser').get_text()


tweet_json_cleaned['source'] = tweet_json_cleaned['source'].apply(_source_text)
print(tweet_json_cleaned['source'].value_counts())
Create a breed_predict column.
# Clean: record the most confident image prediction that is actually a dog
# breed (p1 first, then p2, then p3); rows with no dog prediction stay NaN.
# DataFrame.set_value was removed in pandas 1.0 -- use .at instead.
tweet_json_cleaned['breed_predict'] = np.nan
for index, row in tweet_json_cleaned.iterrows():
    # `== True` (not truthiness) guards against NaN from the outer merge,
    # since float NaN is truthy.
    if row['p1_dog'] == True:
        tweet_json_cleaned.at[index, 'breed_predict'] = row['p1']
    elif row['p2_dog'] == True:
        tweet_json_cleaned.at[index, 'breed_predict'] = row['p2']
    elif row['p3_dog'] == True:
        tweet_json_cleaned.at[index, 'breed_predict'] = row['p3']
tweet_json_cleaned['breed_predict'].value_counts()
For those tweets having 0 favorite_count, retrieve favorite_count from the retweet_status instead.
# Clean: tweets showing 0 favourites are typically retweets -- the favourites
# live on the original status, so pull retweeted_status.favorite_count.
for index, row in tweet_json_cleaned.iterrows():
    if row['favorite_count'] == 0:
        try:
            tweet_status = api.get_status(int(row['tweet_id']))._json
            favorite_count = tweet_status['retweeted_status']['favorite_count']
            # .at replaces the removed DataFrame.set_value (pandas >= 1.0).
            tweet_json_cleaned.at[index, 'favorite_count'] = favorite_count
        except Exception:
            # Status gone or not a retweet -- leave the value and move on.
            print("error ids: " + row['tweet_id'])
tweet_json_cleaned['favorite_count'].value_counts(dropna=False)
tweet_json_cleaned[tweet_json_cleaned['favorite_count'].isnull()]
Remake rating_numerator and rating_denominator.
First, change the type of rating_numerator and rating_denominator into float. Create a dog count column labelling whether the row has multiple dogs or single dog.
# Clean: re-parse ratings from the text, taking the LAST fraction in each
# tweet (the author's original parse took the first, which is wrong when an
# earlier date like "4/20" precedes the real rating).
tweet_json_cleaned['rating_numerator'] = tweet_json_cleaned['rating_numerator'].astype(float)
tweet_json_cleaned['rating_denominator'] = tweet_json_cleaned['rating_denominator'].astype(float)
tweet_json_cleaned['dog_count'] = np.nan
# Compile once; the pattern captures (numerator, optional decimals, denominator).
rating_pattern = re.compile(r'(\d+(\.\d+)?)\/(\d+)')
for index, row in tweet_json_cleaned.iterrows():
    fractions = rating_pattern.findall(row['text'])
    if fractions:
        # Several fractions with a non-/10 first denominator indicate a
        # group-photo rating like "44/40" split across dogs.
        if len(fractions) > 1 and fractions[0][-1] != '10':
            tweet_json_cleaned.at[index, 'dog_count'] = 'multiple'
        else:
            tweet_json_cleaned.at[index, 'dog_count'] = 'single'
        # .at replaces the removed DataFrame.set_value; cast the captured
        # strings to float to match the columns' dtype set above.
        tweet_json_cleaned.at[index, 'rating_numerator'] = float(fractions[-1][0])
        tweet_json_cleaned.at[index, 'rating_denominator'] = float(fractions[-1][-1])
    else:
        # No fraction in the text: the archive's parsed rating was spurious.
        tweet_json_cleaned.at[index, 'rating_numerator'] = np.nan
        tweet_json_cleaned.at[index, 'rating_denominator'] = np.nan
print(tweet_json_cleaned['rating_numerator'].value_counts(dropna=False))
print(tweet_json_cleaned['rating_denominator'].value_counts(dropna=False))
Define several rules to extract dog names from text column. Return the dog names to a new name column.
# Clean: extract dog names from the tweet text with a small rule table.
# Each rule is a (trigger regex, capture regex) pair tried in order; the
# first trigger that matches wins. The per-rule regexes are kept exactly as
# in the original branch-per-pattern version, including the narrower
# punctuation class ([.,!?] without \s) in the 'Say hello to' trigger.
_NAME = r'[A-Z][A-Za-zñáéíóúü/-]+'
_NAME_RULES = [
    # Example: "This is Charlie and Zoey."
    (r'.*This is ' + _NAME + r'([.,!?\s]|( (and|(&)) ' + _NAME + r'))',
     r'.*This is (' + _NAME + r')[.,!?\s]'),
    # Example: "Meet Charlie and Zoey."
    (r'.*Meet ' + _NAME + r'([.,!?\s]|( (and|(&)) ' + _NAME + r'))',
     r'.*Meet (' + _NAME + r')[.,!?\s]'),
    # Example: "Say hello to Charlie and Zoey."
    (r'.*Say hello to ' + _NAME + r'([.,!?]|( (and|(&)) ' + _NAME + r'))',
     r'.*Say hello to (' + _NAME + r')[.,!?\s]'),
    # Example: "This is a golden retriever named Charlie."
    (r'.*named ' + _NAME + r'([.,!?\s]|( (and|(&)) ' + _NAME + r'))',
     r'.*named (' + _NAME + r')[.,!?\s]'),
    # Example: "His name is Zoey."
    (r'.*name is ' + _NAME + r'([.,!?\s]|( (and|(&)) ' + _NAME + r'))',
     r'.*name is (' + _NAME + r')[.,!?\s]'),
]
# A second capitalised name after "and"/"&" means multiple dogs in the tweet.
_COMPANION = r'.* (and|(&)) (' + _NAME + r')'

tweet_json_cleaned['new_name'] = None
for index, row in tweet_json_cleaned.iterrows():
    text = row['text']
    for trigger, capture in _NAME_RULES:
        if re.match(trigger, text):
            # .at replaces the removed DataFrame.set_value (pandas >= 1.0).
            tweet_json_cleaned.at[index, 'new_name'] = re.findall(capture, text)[0]
            companions = re.findall(_COMPANION, text)
            if companions:
                print(companions)
                tweet_json_cleaned.at[index, 'dog_count'] = 'multiple'
            break
    else:
        # No rule matched: no extractable name in this tweet.
        tweet_json_cleaned.at[index, 'new_name'] = None
# Dump the unnamed rows for manual inspection.
tweet_json_cleaned[tweet_json_cleaned['new_name'].isnull()].to_csv('new_name.csv')
tweet_json_cleaned['new_name'].value_counts(dropna=False)
Remove tweets without a jpg_url.
Remove rows with multiple dog_count because the model only considers and predicts one dog.
Remove the columns "Unnamed: 0", "name", and "dog_count".
# Keep only tweets that actually have an image prediction.
tweet_json_cleaned = tweet_json_cleaned[tweet_json_cleaned['jpg_url'].notnull()]
print(tweet_json_cleaned.shape)
# The prediction model only handles a single dog per image.
tweet_json_cleaned = tweet_json_cleaned[tweet_json_cleaned['dog_count'] == 'single']
print(tweet_json_cleaned.shape)
# Drop the superseded columns plus the CSV round-trip index artifacts.
# errors='ignore': how many 'Unnamed: *' columns exist depends on how many
# to_csv/read_csv round trips the file went through, so don't crash on ones
# that are absent.
_drop_cols = ['name', 'dog_count', 'doggo', 'floofer', 'puppo', 'pupper',
              'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1']
tweet_json_cleaned = tweet_json_cleaned.drop(columns=_drop_cols, errors='ignore')
tweet_json_cleaned.shape
tweet_json_cleaned.sample(3)
# index=False keeps the master CSV free of yet another unnamed index column.
tweet_json_cleaned.to_csv('twitter_archive_master.csv', index=False)