-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocessing.py
More file actions
241 lines (212 loc) · 9.42 KB
/
preprocessing.py
File metadata and controls
241 lines (212 loc) · 9.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
from __future__ import division
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import json
import string
import os
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
# Shared, module-level text-processing resources used by the classes below.

# English stop words from the NLTK corpus, materialised as a plain list.
stop_words = [*stopwords.words('english')]

# One WordNet lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

# Character class matching any single character of string.punctuation.
puncts = re.compile('[{}]'.format(re.escape(string.punctuation)))
class Cleaner():
    """Clean raw per-user tweet CSV files and store the normalised text.

    Every regular file found directly inside ``folder`` is treated as one
    user's raw export.  The cleaned tweets are written, under the same file
    name, to ``<folder>/cleaned_data/`` as a CSV with a single ``tweets``
    column.
    """

    # Patterns for the individual normalisation steps, compiled once.
    _ENTITY_RE = re.compile(r'&[a-zA-Z0-9]+;?')    # HTML entities, e.g. "&amp;"
    _RETWEET_RE = re.compile(r'\bRT\b')            # stand-alone retweet marker
    _URL_RE = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
    _MENTION_RE = re.compile("@[A-Za-z0-9_]+")
    _HASHTAG_RE = re.compile('#[A-Za-z0-9_]+')
    _UNIT_RE = re.compile('[0-9]+[a-zA-Z]+')       # 15ft, 12cm, 5k, ...
    _NUMBER_RE = re.compile('[0-9]+')
    _CHAT_U_RE = re.compile(r'\bu\b')              # chat shorthand for "you"

    # Applied in order.  The irregular forms must come before the generic
    # "n't" rule, otherwise "won't" would already have become "wo not".
    _CONTRACTIONS = (
        (re.compile(r"\bwon't\b", re.IGNORECASE), "will not"),
        (re.compile(r"\bcan't\b", re.IGNORECASE), "can not"),
        (re.compile("'ld"), " would"),
        (re.compile("'d"), " had"),
        (re.compile("'ve"), " have"),
        (re.compile("'m"), " am"),
        (re.compile("n't"), " not"),
    )

    def __init__(self, folder='tweets'):
        """Record the raw-data folder, list its user files and create the
        output folder.

        Args:
            folder: Directory holding one raw CSV file per user.

        Raises:
            OSError: If ``folder`` cannot be listed (e.g. it does not exist).
        """
        self.folder = folder
        # Only regular files are user exports.  Sub-folders created by this
        # module (cleaned_data, users, analytics) must not be picked up when
        # the cleaner is instantiated a second time.
        self.users = [
            name for name in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, name))
        ]
        # Folder for storing cleaned data
        os.makedirs(os.path.join(folder, 'cleaned_data'), exist_ok=True)

    @staticmethod
    def _expand_contractions(text):
        """Return ``text`` with common English contractions spelled out."""
        for pattern, replacement in Cleaner._CONTRACTIONS:
            text = pattern.sub(replacement, text)
        return text

    def _normalise_text(self, text, remove_hashtags, remove_mentions):
        """Apply the character-level cleaning steps to a single tweet."""
        text = self._ENTITY_RE.sub("", text)
        text = self._RETWEET_RE.sub("", text)
        text = self._expand_contractions(text)
        text = self._URL_RE.sub("", text)
        if remove_mentions:
            text = self._MENTION_RE.sub("", text)
        if remove_hashtags:
            text = self._HASHTAG_RE.sub("", text)
        text = puncts.sub("", text)                           # punctuation
        text = text.encode('ascii', 'ignore').decode('ascii')  # emojis and other non-ASCII
        # Placeholders are inserted after punctuation removal so that their
        # angle brackets survive; the lower() below turns them into
        # '<unit>' and '<number>'.
        text = self._UNIT_RE.sub('<UNIT>', text)
        text = self._NUMBER_RE.sub('<NUMBER>', text)
        text = text.lower()
        return self._CHAT_U_RE.sub('you', text)

    def clean_data(self, username, remove_hashtags=True, remove_mentions=True, remove_stopwords=True):
        """Clean one user's raw file and write it to ``cleaned_data``.

        The file is read line by line instead of through a CSV parser, which
        tolerates malformed CSV: the header line is skipped and, on every
        other line, the first two comma-separated fields are dropped and the
        remainder is taken as the tweet text.

        Args:
            username: File name of the raw export inside ``self.folder``.
            remove_hashtags: Drop ``#hashtag`` tokens entirely.
            remove_mentions: Drop ``@mention`` tokens entirely.
            remove_stopwords: Drop tokens listed in ``stop_words``.

        Raises:
            OSError: If the raw file cannot be read or the output cannot be
                written.
        """
        with open(os.path.join(self.folder, username), 'r', encoding='utf-8') as handle:
            rows = handle.readlines()[1:]

        # Set membership keeps the per-token stop-word test cheap.
        blocked = set(stop_words) if remove_stopwords else set()

        cleaned = []
        for row in rows:
            text = ','.join(row.strip('\n').split(',')[2:])
            text = self._normalise_text(text, remove_hashtags, remove_mentions)
            # Lemmatize. Can also try stemming
            tokens = [
                lemmatizer.lemmatize(token)
                for token in word_tokenize(text)
                if token not in blocked
            ]
            cleaned.append(' '.join(tokens))

        # One cleaned CSV per user
        target = os.path.join(self.folder, 'cleaned_data', username)
        pd.DataFrame({'tweets': cleaned}).to_csv(target, index=False)

    def clean(self):
        """Clean every user file, continuing past files that fail.

        Cleaning is best effort: a failure for one user is reported after the
        run instead of aborting it.  Interrupts such as ``KeyboardInterrupt``
        are not swallowed.
        """
        print('Cleaning the data')
        failures = []
        for username in tqdm(self.users):
            if not os.path.isfile(os.path.join(self.folder, username)):
                continue
            try:
                self.clean_data(username)
            except Exception as error:
                failures.append((username, error))
        for username, error in failures:
            print('Could not clean {}: {}'.format(username, error))
        print('Done cleaning the data')
class Analytics():
    """Compute and plot word-count statistics and keywords per user.

    Reads the cleaned CSV files in ``<folder>/cleaned_data/`` and writes
    per-user results to ``<folder>/users/`` and the overall chart to
    ``<folder>/analytics/``.
    """

    def __init__(self, folder='tweets'):
        """Record the data folder and create the output folders.

        Args:
            folder: Root directory that contains ``cleaned_data``.
        """
        self.folder = folder
        # Folders for per-user insights and for the overall chart
        for sub_folder in ('users', 'analytics'):
            os.makedirs(os.path.join(folder, sub_folder), exist_ok=True)

    @staticmethod
    def _user_label(filename):
        """Return ``filename`` without a trailing ``.csv``.

        ``str.strip('.csv')`` must not be used for this: it removes any of
        the characters '.', 'c', 's', 'v' from both ends of the name.
        """
        suffix = '.csv'
        if filename.endswith(suffix):
            return filename[:-len(suffix)]
        return filename

    def analyse_user(self, username):
        """Count the words of every tweet of one user.

        Args:
            username: File name inside ``cleaned_data``.

        Returns:
            A tuple ``(word_counts, average)`` where ``word_counts`` is the
            ascending list of per-tweet token counts and ``average`` is
            their mean.

        Raises:
            ZeroDivisionError: If the file contains no tweets.
        """
        path = os.path.join(self.folder, 'cleaned_data', username)
        tweets = pd.read_csv(path)['tweets'].tolist()
        word_counts = []
        for tweet in tweets:
            try:
                word_counts.append(len(word_tokenize(tweet)))
            except TypeError:
                # An empty tweet is read back as a non-string value
                word_counts.append(0)
        word_counts.sort()
        average = sum(word_counts) / len(word_counts)
        return word_counts, average

    def plot_data(self, username, save=True):
        """Plot the per-tweet word counts of one user.

        Args:
            username: File name inside ``cleaned_data``.
            save: When true, save the chart to ``<folder>/users/<name>.png``;
                otherwise show it.
        """
        word_counts, _ = self.analyse_user(username)
        label = self._user_label(username)
        # A dedicated figure keeps charts of successive calls from being
        # drawn on top of each other.
        figure, axes = plt.subplots()
        axes.bar(list(range(1, len(word_counts) + 1)), word_counts)
        axes.set_xlabel("Tweet Number")
        axes.set_ylabel("Maximum number of words")
        axes.set_title("Maximum words per tweet for {}".format(label))
        if save:
            figure.savefig(os.path.join(self.folder, 'users', label + '.png'))
            plt.close(figure)
        else:
            plt.show()

    def analyse(self, save=True):
        """Write a JSON summary per user and plot the overall statistics.

        For every file in ``cleaned_data`` a ``<folder>/users/<name>.json``
        file is written with the keys ``name``, ``keywords``,
        ``max_number_of_words`` and ``average``.  Afterwards the per-user
        maxima and averages are drawn as bars.

        Args:
            save: When true, save the chart to
                ``<folder>/analytics/analytics.png``; otherwise show it.
        """
        print('Calculating the analysis of cleaned data')
        cleaned_dir = os.path.join(self.folder, 'cleaned_data')
        max_words = []
        averages = []
        for username in tqdm(os.listdir(cleaned_dir)):
            # Skip sub-directories explicitly instead of relying on a
            # platform-specific exception from the CSV reader.
            if not os.path.isfile(os.path.join(cleaned_dir, username)):
                continue
            label = self._user_label(username)
            keywords = self.extract_keywords(username)
            try:
                word_counts, average = self.analyse_user(username)
            except ZeroDivisionError:
                # User without any tweets
                word_counts, average = [0], 0
            analytics = {
                'name': label,
                'keywords': keywords,
                'max_number_of_words': max(word_counts),
                'average': average,
            }
            with open(os.path.join(self.folder, 'users', label + '.json'), 'w') as f:
                json.dump(analytics, f)
            max_words.append(analytics['max_number_of_words'])
            averages.append(average)
        print('Done calculating the analytics')

        if not max_words:
            # max() of an empty list would raise; there is nothing to plot.
            print('No cleaned data found; nothing to plot')
            return

        figure, ax = plt.subplots()
        positions = list(range(1, len(max_words) + 1))
        ax.bar(positions, max_words, color='b', align='center', label='Maximum Words per User: Maximum = {}'.format(max(max_words)))
        ax.bar(positions, averages, color='g', align='center', label='Average Words per User: Maximum = {}'.format(int(max(averages))+1))
        ax.autoscale(tight=True)
        ax.legend(loc='upper left')
        if save:
            print("Saving...")
            figure.savefig(os.path.join(self.folder, 'analytics', 'analytics.png'))
            plt.close(figure)
        else:
            plt.show()
        print("Done")

    def extract_keywords(self, username, max_features=5):
        """Extract keywords from one user's tweets with a TF-IDF vectorizer.

        Args:
            username: File name inside ``cleaned_data``.
            max_features: Upper bound on the number of keywords.

        Returns:
            The vectorizer's vocabulary terms as a list of ``str``; an empty
            list when reading or fitting raises ``ValueError`` (for example
            when no usable text is left).
        """
        path = os.path.join(self.folder, 'cleaned_data', username)
        try:
            corpus = pd.read_csv(path)['tweets'].dropna().astype(str).tolist()
            vectorizer = TfidfVectorizer(max_features=max_features)
            vectorizer.fit_transform(corpus)
        except ValueError:
            return []
        # get_feature_names() no longer exists in recent scikit-learn
        # releases; prefer its replacement and fall back for old versions.
        get_names = getattr(vectorizer, 'get_feature_names_out', None)
        if get_names is None:
            get_names = vectorizer.get_feature_names
        # Plain Python strings so the result can be serialised by json.dump
        return [str(term) for term in get_names()]
class Numoftweets():
    """Word and tweet totals per user, computed from the cleaned CSV files
    in ``<folder>/cleaned_data/``."""

    def __init__(self, folder='tweets'):
        """Record the data folder and create the per-user output folder.

        Args:
            folder: Root directory that contains ``cleaned_data``.
        """
        self.folder = folder
        # Folder for per-user insights
        os.makedirs(os.path.join(folder, 'users'), exist_ok=True)

    def user_data(self, username):
        """Count the words and tweets of one user.

        Args:
            username: File name inside ``cleaned_data``.

        Returns:
            A tuple ``(word_count, tweet_count)``; empty tweets are ignored
            and words are split on whitespace.
        """
        path = os.path.join(self.folder, 'cleaned_data', username)
        tweets = pd.read_csv(path)['tweets'].dropna().tolist()
        word_count = sum(len(str(tweet).split()) for tweet in tweets)
        return word_count, len(tweets)

    def _collect_counts(self):
        """Return ``(word_counts, tweet_counts)`` lists with one entry per
        file in ``<folder>/cleaned_data/``."""
        cleaned_dir = os.path.join(self.folder, 'cleaned_data')
        word_counts = []
        tweet_counts = []
        for filename in os.listdir(cleaned_dir):
            if not os.path.isfile(os.path.join(cleaned_dir, filename)):
                continue
            words, tweets = self.user_data(filename)
            word_counts.append(words)
            tweet_counts.append(tweets)
        return word_counts, tweet_counts

    def plot_userdata(self, username='plot', save=True):
        """Plot the total number of words per user in ascending order.

        Args:
            username: Base name of the image file written when saving.
            save: When true, save the chart to ``<username>.png`` in the
                current working directory; otherwise show it.
        """
        count, _ = self._collect_counts()
        count.sort()
        # A dedicated figure keeps successive charts from overlapping.
        figure, axes = plt.subplots()
        axes.bar(list(range(1, len(count) + 1)), count)
        axes.set_xlabel("user Number")
        axes.set_ylabel("number of words")
        axes.set_title("words per user")
        if save:
            figure.savefig(username + '.png')
            plt.close(figure)
        else:
            plt.show()

    def plot_userdataavg(self, username='plot1', save=True):
        """Plot the average number of words per tweet for every user.

        Args:
            username: Base name of the image file written when saving.
            save: When true, save the chart to ``<username>.png`` in the
                current working directory; otherwise show it.
        """
        count, numtweets = self._collect_counts()
        # Users without tweets get an average of 0 instead of a division
        # by zero.
        count_avg = [
            words / tweets if tweets else 0.0
            for words, tweets in zip(count, numtweets)
        ]
        figure, axes = plt.subplots()
        axes.bar(list(range(1, len(count_avg) + 1)), count_avg)
        axes.set_xlabel("User Number")
        axes.set_ylabel("number of words")
        axes.set_title("average words per user per tweet")
        if save:
            figure.savefig(username + '.png')
            plt.close(figure)
        else:
            plt.show()

    def totalavg(self):
        """Return the average number of words per tweet over all users.

        Returns:
            Total words divided by total tweets, or ``0.0`` when there are
            no tweets at all.
        """
        count, numtweets = self._collect_counts()
        total_tweets = sum(numtweets)
        if not total_tweets:
            return 0.0
        # Sum over every user, not just the last file that was read.
        return sum(count) / total_tweets