1+ # -*- coding: utf-8 -*-
2+ """twitterapiaccount.ipynb
3+
4+ Automatically generated by Colaboratory.
5+
6+ Original file is located at
7+ https://colab.research.google.com/drive/1q7pmP3GtNueW7iA4mVutlMdT7BcFZKJR
8+ """
9+
from google.colab import drive
drive.mount('/content/drive')
12+
!pip install better_profanity
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
20+
21+
22+
23+ RAW_INPUT_TRAINING_DATA = "/content/drive/MyDrive/twitterapiaccount/dataset_combined_2510_new.csv"
24+
25+
26+
27+ USER_TWEET_DATA_FILE = "user_tweets.csv"
28+ PREPROCESSED_INPUT_TRAINING_DATA = "preprocessed_input_data.csv"
29+ MODEL_FILE = "model.pkl"
30+ VECTORIZER_FILE = "vectorizer.pkl"
31+
32+ # Commented out IPython magic to ensure Python compatibility.
33+
# DATA CLEANING: vectorizer and NLP

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline

import re
from sklearn.feature_extraction.text import TfidfVectorizer

# importing NLP packages
from nltk import stem
from nltk.corpus import stopwords
stemmer = stem.SnowballStemmer('english')
stopwords = set(stopwords.words('english'))
49+
50+
# remove URLs, numbers and special characters, keeping only letters and whitespace
def keep_alpha(s):
    non_url = re.sub(r"http\S+", "", s)
    res = re.sub(r'[^a-zA-Z\s]', '', non_url)
    # replace newlines with spaces
    res1 = re.sub(r'\n', ' ', res)
    return res1
58+
def nlp_preprocessing(msg):
    try:
        # converting messages to lowercase
        msg = msg.lower()
        # removing stopwords
        msg = [word for word in msg.split() if word not in stopwords]
        # using a stemmer (getting the root form of each word)
        msg = " ".join([stemmer.stem(word) for word in msg])

    except Exception as e:
        print(e)

    return msg
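
# A quick sanity check of the two helpers above (not part of the original
# pipeline; the sample tweet below is made up for illustration). It should
# print a lowercase, stopword-free, stemmed version of the text with the URL
# and punctuation stripped.
sample_tweet = "I can't sleep at night... feeling hopeless https://t.co/xyz"
print(nlp_preprocessing(keep_alpha(sample_tweet)))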
72+
73+
df = pd.read_csv(RAW_INPUT_TRAINING_DATA)
df.rename(columns={'Text': 'tweet'}, inplace=True)
df = df.dropna()
# shuffle the rows
df = df.sample(frac=1).reset_index(drop=True)
# df = df.sample(frac=0.1).reset_index(drop=True)
79+
80+
# data preprocessing using NLP: nltk
df['tweet'] = df['tweet'].astype(str)
# remove leading and trailing whitespace
df['tweet'] = df['tweet'].str.strip()

# keep only alphabetic characters
df['tweet'] = df['tweet'].apply(keep_alpha)

# NLP preprocessing to remove stopwords and get the base/stem form of each word
df['tweet'] = df['tweet'].apply(nlp_preprocessing)
print(df.head(2))
print(df.tail(2))


print(df['depressed'].value_counts())

# df.to_csv("data//preprocessed_input_data.csv", index=False)
df.to_csv(PREPROCESSED_INPUT_TRAINING_DATA, index=False)
99+
import pandas as pd
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# plot_confusion_matrix was removed in newer scikit-learn; use ConfusionMatrixDisplay instead
from sklearn.metrics import ConfusionMatrixDisplay
105+
106+
# joblib is used to save and load the model
import joblib
109+
110+ svmout = 0
111+ lrout = 0
112+ dtout = 0
113+
## SVM
def train_svm(X_train, X_test, y_train, y_test):

    from sklearn.svm import SVC
    svm = SVC(C=1000)

    # training the SVM model
    svm.fit(X_train, y_train)

    print("\n\n----SVM------")
    y_pred = svm.predict(X_test)
    print("Confusion matrix SVM:\n", confusion_matrix(y_test, y_pred))

    ConfusionMatrixDisplay.from_estimator(svm, X_test, y_test)
    plt.show()

    # calculate the accuracy
    svmout = round((accuracy_score(y_test, y_pred) * 100), 2)
    print("Accuracy score for SVM: ", svmout)

    return svm, svmout
135+
136+
137+
## Logistic Regression
def train_logistic_regression(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegression
    # Create an instance of the model.
    logreg = LogisticRegression()
    # Train the model.
    logreg.fit(X_train, y_train)

    # Do prediction.
    y_pred = logreg.predict(X_test)

    print("\n\n-----------Logistic Regression-----")
    print("Confusion matrix Logistic Regression:\n", confusion_matrix(y_test, y_pred))

    ConfusionMatrixDisplay.from_estimator(logreg, X_test, y_test)
    plt.show()

    # calculate the accuracy
    lrout = round((accuracy_score(y_test, y_pred) * 100), 2)
    print("Accuracy score for Logistic Regression: ", lrout)
    return logreg, lrout
159+
160+
161+
## Decision Tree
def train_decision_tree(X_train, X_test, y_train, y_test):
    from sklearn.tree import DecisionTreeClassifier
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("\n\n--------Decision Tree------------")
    print("Confusion matrix Decision Tree:\n", confusion_matrix(y_test, y_pred))

    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
    plt.show()

    dtout = round((accuracy_score(y_test, y_pred) * 100), 2)
    print("Accuracy score for Decision Tree: ", dtout)
    return model, dtout
179+
180+
181+
182+
183+
# training the ML model
# df = pd.read_csv("data//preprocessed_input_data.csv")
df = pd.read_csv(PREPROCESSED_INPUT_TRAINING_DATA)
df = df.dropna()
print(df.head())

# training the vectorizer (convert text data to numeric features)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['tweet'].values)
y = df['depressed'].values

# save the vectorizer object to vectorize user tweets later
# joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(vectorizer, VECTORIZER_FILE)
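
# Optional sanity check (added for illustration, not in the original notebook):
# reload the vectorizer that was just saved and confirm it can transform new
# text into the same feature space. The sample string is made up.
reloaded_vectorizer = joblib.load(VECTORIZER_FILE)
sample_vec = reloaded_vectorizer.transform(["feel alon tire everyth"])
print("Vectorized sample shape:", sample_vec.shape)  # (1, vocabulary size)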
199+
200+
# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
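
# A quick check (added for illustration, not part of the original flow) that
# both classes remain represented in similar proportions after the split.
print(pd.Series(y_train).value_counts(normalize=True))
print(pd.Series(y_test).value_counts(normalize=True))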
204+
205+
206+
# checking accuracy of SVM
svm_model, svmout = train_svm(X_train, X_test, y_train, y_test)

# checking accuracy of Logistic Regression
lr_model, lrout = train_logistic_regression(X_train, X_test, y_train, y_test)

# checking accuracy of the Decision Tree algorithm
dt_model, dtout = train_decision_tree(X_train, X_test, y_train, y_test)
215+
216+
217+
# PLOTTING model accuracies
219+
220+ # data = {'SVM':svmout, 'Logistic Regression':lrout, 'Decision Tree':dtout}
221+ # courses = list(data.keys())
222+ # values = list(data.values())
223+
224+ # fig = plt.figure(figsize = (10, 5))
225+ # # creating the bar plot
226+ # plt.bar(courses, values, color ='maroon',
227+ # width = 0.4)
228+ # plt.xlabel("Tweet")
229+ # plt.ylabel("No. of tweets")
230+ # plt.title("Depression Analysis")
231+ # plt.show()
232+
model_names = ['SVM', 'Logistic Regression', 'Decision Tree']
acc_values = [svmout, lrout, dtout]
colors = ['red', 'blue', 'green']

plt.figure(figsize=(10, 5))
bars = plt.bar(model_names, height=acc_values, color=colors, width=.5)
plt.xlabel('Model')
plt.ylabel('Accuracy %')
plt.title("Depression Analysis")

print("\n\n")
# annotate each bar with its accuracy value
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .5, yval)

plt.show()

print("\n\n")
255+
256+
# choose the SVM classifier based on its higher accuracy score
model, accuracy_final = train_svm(X_train, X_test, y_train, y_test)


# Save the model as a pickle file at the given location "model.pkl"
# joblib.dump(model, 'model.pkl')
joblib.dump(model, MODEL_FILE)
264+
265+
# Load/Read the model from the file at the given location "model.pkl"
# classification_model = joblib.load('model.pkl')
classification_model = joblib.load(MODEL_FILE)

# predicting with the model on test data
y_pred = classification_model.predict(X_test)

# calculate the accuracy
print("\n\nModel accuracy: ", round((accuracy_score(y_test, y_pred) * 100), 2))

print("\n\n", confusion_matrix(y_test, y_pred))
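
# Beyond plain accuracy, a per-class precision/recall report can be printed
# for the reloaded model (added here as an optional illustration, not part of
# the original notebook).
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))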
277+
# PREDICT TWEETS
# Twitter dataset scraping based on username

import re
import numpy as np
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob

import pandas as pd
from wordcloud import WordCloud
from better_profanity import profanity
import configparser

import joblib
293+
def download_user_tweets():
    # set Twitter credentials (placeholders: supply your own keys here, or
    # load them from config.ini as in the commented block below)
    api_key = 'YOUR_API_KEY'
    api_key_secret = 'YOUR_API_KEY_SECRET'
    access_token = 'YOUR_ACCESS_TOKEN'
    access_token_secret = 'YOUR_ACCESS_TOKEN_SECRET'

    # access Twitter data (log in to Twitter via the API)
    auth = tweepy.OAuthHandler(api_key, api_key_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    # read configs (alternative to hard-coding credentials above)
    # config = configparser.ConfigParser()
    # config.read('config.ini')
    # api_key = config['twitter']['api_key']
    # api_key_secret = config['twitter']['api_key_secret']
    # access_token = config['twitter']['access_token']
    # access_token_secret = config['twitter']['access_token_secret']
    # auth = tweepy.OAuthHandler(api_key, api_key_secret)
    # auth.set_access_token(access_token, access_token_secret)
    # api = tweepy.API(auth)

    # user tweets
    user = input("Enter Twitter username:").strip()
    if len(user) <= 1:
        user = 'elonmusk'
    limit = 50

    tweets = tweepy.Cursor(api.user_timeline, screen_name=user, count=200, tweet_mode='extended').items(limit)

    # tweets = api.user_timeline(screen_name=user, count=limit, tweet_mode='extended')

    # create DataFrame
    columns = ['User', 'tweet']
    data = []

    for tweet in tweets:
        data.append([tweet.user.screen_name, tweet.full_text])

    df = pd.DataFrame(data, columns=columns)

    # print(df.head())
    # save user tweets to csv
    # print("LOGGER: saving user tweets to: ", USER_TWEET_DATA_FILE)
    df.to_csv(USER_TWEET_DATA_FILE, index=False)

    return df
344+
345+
def predict_user_tweets(df):

    # user tweet preprocessing using NLP: nltk

    df['tweet'] = df['tweet'].astype(str)
    # remove leading and trailing whitespace
    df['tweet'] = df['tweet'].str.strip()

    # keep only alphabetic characters
    df['tweet'] = df['tweet'].apply(keep_alpha)

    # NLP preprocessing to remove stopwords and get the base/stem form of each word
    df['tweet'] = df['tweet'].apply(nlp_preprocessing)
    df['tweet'] = df['tweet'].str.strip()

    # replace empty rows with NaN and then drop them
    df['tweet'].replace('', np.nan, inplace=True)
    df = df.dropna()
    df = df.reset_index(drop=True)

    vectorizer = joblib.load(VECTORIZER_FILE)
    X_test = vectorizer.transform(df['tweet'].values)

    # Load/Read the model from the file at the given location "model.pkl"
    # classification_model = joblib.load('model.pkl')
    classification_model = joblib.load(MODEL_FILE)

    # predicting with the model on the user's tweets
    y_pred = classification_model.predict(X_test)

    # print(y_pred)
    df['prediction'] = y_pred
    print(df[['tweet', 'prediction']])
    return list(y_pred)
380+
381+
def final_output(predictions):
    total = len(predictions)
    depressed_count = predictions.count("YES")

    print("\n\n")

    # flag the user as depressed if more than 60% of their tweets are predicted "YES"
    if depressed_count > (total * .6):
        print("Result: DEPRESSED 😒")
    else:
        print("Result: NOT DEPRESSED 😊")


user_tweets = download_user_tweets()
predictions = predict_user_tweets(user_tweets)
final_output(predictions)
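
# Toy illustration (added, not part of the original flow) of the 60% threshold
# implemented in final_output(): 2 "YES" out of 5 predictions is 40%, below the
# threshold, so this extra demo call prints NOT DEPRESSED.
print("\nThreshold demo on a made-up prediction list:")
final_output(["YES", "NO", "YES", "NO", "NO"])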