
Commit a9345e6

main file
1 parent 6a24d44 commit a9345e6

File tree

1 file changed: +395 −0 lines changed

twitterapiaccount.py

Lines changed: 395 additions & 0 deletions
@@ -0,0 +1,395 @@
# -*- coding: utf-8 -*-
"""twitterapiaccount.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1q7pmP3GtNueW7iA4mVutlMdT7BcFZKJR
"""

from google.colab import drive
drive.mount('/content/drive')

!pip install better_profanity
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

RAW_INPUT_TRAINING_DATA = "/content/drive/MyDrive/twitterapiaccount/dataset_combined_2510_new.csv"

USER_TWEET_DATA_FILE = "user_tweets.csv"
PREPROCESSED_INPUT_TRAINING_DATA = "preprocessed_input_data.csv"
MODEL_FILE = "model.pkl"
VECTORIZER_FILE = "vectorizer.pkl"

# Commented out IPython magic to ensure Python compatibility.

# DATA CLEANING: vectorizer and NLP

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline

import re
from sklearn.feature_extraction.text import TfidfVectorizer

# importing NLP packages
from nltk import stem
from nltk.corpus import stopwords
stemmer = stem.SnowballStemmer('english')
stop_words = set(stopwords.words('english'))  # named to avoid shadowing the nltk stopwords module

# remove URLs, special characters, and numbers
def keep_alpha(s):
    non_url = re.sub(r"http\S+", "", s)
    res = re.sub(r'[^a-zA-Z\s]', '', non_url)
    res1 = re.sub(r'\n', '', res)
    return res1

def nlp_preprocessing(msg):
    try:
        # convert the message to lowercase
        msg = msg.lower()
        # remove stopwords
        msg = [word for word in msg.split() if word not in stop_words]
        # stem each word down to its root form
        msg = " ".join([stemmer.stem(word) for word in msg])
    except Exception as e:
        print(e)
    return msg
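
# Illustrative sanity check (not from the original commit): chaining the two
# helpers on a made-up tweet. "down" is an NLTK stopword and "feeling" stems
# to "feel", so the expected output is "feel today".
# print(nlp_preprocessing(keep_alpha("Feeling down today :( https://t.co/xyz")))
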
df = pd.read_csv(RAW_INPUT_TRAINING_DATA)
df.rename(columns={'Text': 'tweet'}, inplace=True)
df = df.dropna()
# shuffle the rows (drop=True keeps the old index out of the data)
df = df.sample(frac=1).reset_index(drop=True)
# df = df.sample(frac=0.1).reset_index(drop=True)

# data preprocessing using NLP (nltk)
df['tweet'] = df['tweet'].astype(str)
# remove leading and trailing whitespace
df['tweet'] = df['tweet'].str.strip()

# keep only alphabetic characters
df['tweet'] = df['tweet'].apply(keep_alpha)

# NLP preprocessing: remove stopwords and stem each word
df['tweet'] = df['tweet'].apply(nlp_preprocessing)
print(df.head(2))
print(df.tail(2))

print(df['depressed'].value_counts())

df.to_csv(PREPROCESSED_INPUT_TRAINING_DATA, index=False)

import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# ConfusionMatrixDisplay replaces plot_confusion_matrix, which was removed in scikit-learn 1.2
from sklearn.metrics import ConfusionMatrixDisplay

# to save or to load the model
import joblib

## SVM
def train_svm(X_train, X_test, y_train, y_test):
    from sklearn.svm import SVC
    clf = SVC(C=1000)

    # train the SVM model
    clf.fit(X_train, y_train)

    print("\n\n----SVM------")
    y_pred = clf.predict(X_test)
    print("Confusion matrix SVM:\n", confusion_matrix(y_test, y_pred))

    ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
    plt.show()

    # calculate the accuracy
    svmout = round(accuracy_score(y_test, y_pred) * 100, 2)
    print("Accuracy score for SVM: ", svmout)

    return clf, svmout
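
# Note (added for clarity): SVC falls back to the RBF kernel when no kernel is
# specified, and C=1000 is a comparatively hard margin that strongly penalizes
# misclassified training points.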

## Logistic Regression
def train_logistic_regression(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegression
    # create an instance of the model
    logreg = LogisticRegression()
    # train the model
    logreg.fit(X_train, y_train)

    # make predictions
    y_pred = logreg.predict(X_test)

    print("\n\n-----------Logistic Regression-----")
    print("Confusion matrix Logistic Regression:\n", confusion_matrix(y_test, y_pred))

    ConfusionMatrixDisplay.from_estimator(logreg, X_test, y_test)
    plt.show()

    # calculate the accuracy
    lrout = round(accuracy_score(y_test, y_pred) * 100, 2)
    print("Accuracy score for Logistic Regression: ", lrout)
    return logreg, lrout

## Decision Tree
def train_decision_tree(X_train, X_test, y_train, y_test):
    from sklearn.tree import DecisionTreeClassifier
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("\n\n--------Decision Tree------------")
    print("Confusion matrix Decision Tree:\n", confusion_matrix(y_test, y_pred))

    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
    plt.show()

    dtout = round(accuracy_score(y_test, y_pred) * 100, 2)
    print("Accuracy score for Decision Tree: ", dtout)
    return model, dtout

# training the ML models
df = pd.read_csv(PREPROCESSED_INPUT_TRAINING_DATA)
df = df.dropna()
print(df.head())

# train the vectorizer (convert text data to numeric data)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['tweet'].values)
y = df['depressed'].values

# save the vectorizer object to vectorize user tweets later
joblib.dump(vectorizer, VECTORIZER_FILE)

# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
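
# Illustrative check (not from the original commit): the reloaded vectorizer
# must reproduce the exact feature space the model is trained on, otherwise
# the columns at prediction time will not line up with training.
# assert joblib.load(VECTORIZER_FILE).transform(["sample text"]).shape[1] == X.shape[1]
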

# checking accuracy of SVM
svm_model, svmout = train_svm(X_train, X_test, y_train, y_test)

# checking accuracy of Logistic Regression
lr_model, lrout = train_logistic_regression(X_train, X_test, y_train, y_test)

# checking accuracy of Decision Tree
dt_model, dtout = train_decision_tree(X_train, X_test, y_train, y_test)

# PLOTTING

x = ['SVM', 'Logistic Regression', 'Decision Tree']
y = [svmout, lrout, dtout]
color = ['red', 'blue', 'green']

# create the sized figure before plotting so the bars land on it
plt.figure(figsize=(15, 15))
bars = plt.bar(x, height=y, color=color, width=.5)
plt.xlabel('Model')
plt.ylabel('Accuracy %')
plt.title("Depression Analysis")

print("\n\n")
# label each bar with its accuracy value
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .5, yval)

plt.show()

print("\n\n")

# choose SVM based on its higher accuracy score
# (this retrains the same SVC; reusing svm_model from above would give the same result)
model, accuracy_final = train_svm(X_train, X_test, y_train, y_test)

# save the model as a pickle file at MODEL_FILE ("model.pkl")
joblib.dump(model, MODEL_FILE)

# load the model back from MODEL_FILE ("model.pkl")
classification_model = joblib.load(MODEL_FILE)

# evaluate the loaded model on the test data
y_pred = classification_model.predict(X_test)

# calculate the accuracy
print("\n\n Model accuracy: ", round(accuracy_score(y_test, y_pred) * 100, 2))

print("\n\n", confusion_matrix(y_test, y_pred))

# PREDICT TWEETS
# scrape tweets from a given user's timeline via the Twitter API

import re
import numpy as np
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob
import pandas as pd
from wordcloud import WordCloud
from better_profanity import profanity
import configparser

import joblib

def download_user_tweets():
    # set Twitter credentials (placeholders: never commit real keys;
    # load them from config.ini or environment variables instead)
    api_key = 'YOUR_API_KEY'
    api_key_secret = 'YOUR_API_KEY_SECRET'
    access_token = 'YOUR_ACCESS_TOKEN'
    access_token_secret = 'YOUR_ACCESS_TOKEN_SECRET'

    # access Twitter data (log in to Twitter via the API)
    auth = tweepy.OAuthHandler(api_key, api_key_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    # alternative: read the credentials from a config file
    # config = configparser.ConfigParser()
    # config.read('config.ini')
    # api_key = config['twitter']['api_key']
    # api_key_secret = config['twitter']['api_key_secret']
    # access_token = config['twitter']['access_token']
    # access_token_secret = config['twitter']['access_token_secret']

    # user tweets
    user = input("Enter Twitter username:").strip()
    if len(user) <= 1:
        user = 'elonmusk'
    limit = 50

    tweets = tweepy.Cursor(api.user_timeline, screen_name=user, count=200, tweet_mode='extended').items(limit)

    # create a DataFrame
    columns = ['User', 'tweet']
    data = []

    for tweet in tweets:
        data.append([tweet.user.screen_name, tweet.full_text])

    df = pd.DataFrame(data, columns=columns)

    # save user tweets to CSV
    df.to_csv(USER_TWEET_DATA_FILE, index=False)

    return df
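
# Note (added for clarity): tweepy.Cursor paginates the v1.1 user_timeline
# endpoint; count=200 requests the maximum page size, while .items(limit)
# stops iteration after `limit` tweets, so at most 50 tweets are fetched here.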

def predict_user_tweets(df):
    # user tweet preprocessing using NLP (nltk)
    df['tweet'] = df['tweet'].astype(str)
    # remove leading and trailing whitespace
    df['tweet'] = df['tweet'].str.strip()

    # keep only alphabetic characters
    df['tweet'] = df['tweet'].apply(keep_alpha)

    # NLP preprocessing: remove stopwords and stem each word
    df['tweet'] = df['tweet'].apply(nlp_preprocessing)
    df['tweet'] = df['tweet'].str.strip()

    # replace empty rows with NaN and then drop them
    df['tweet'] = df['tweet'].replace('', np.nan)
    df = df.dropna()
    df = df.reset_index(drop=True)

    # vectorize with the same fitted vectorizer used at training time
    vectorizer = joblib.load(VECTORIZER_FILE)
    X_user = vectorizer.transform(df['tweet'].values)

    # load the model from MODEL_FILE ("model.pkl")
    classification_model = joblib.load(MODEL_FILE)

    # run the model on the user's tweets
    y_pred = classification_model.predict(X_user)

    df['prediction'] = y_pred
    print(df[['tweet', 'prediction']])
    return list(y_pred)

def final_output(predictions):
    total = len(predictions)
    depressed_count = predictions.count("YES")

    print("\n\n")

    # flag as depressed when more than 60% of the tweets are predicted "YES"
    if depressed_count > (total * .6):
        print("Result: DEPRESSED 😒")
    else:
        print("Result: NOT DEPRESSED 😊")

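# Note (added for clarity): with the default limit of 50 tweets, a user is
# flagged as depressed only when more than 50 * 0.6 = 30 tweets are
# predicted "YES".
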
user_tweets = download_user_tweets()
predictions = predict_user_tweets(user_tweets)
final_output(predictions)
