Wednesday, 25 September 2019

Sentiment Analysis using NLTK and Sklearn in Python

Data can be downloaded from -

http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

Step 1 - loading required libraries

import os  # to check the working path
import pickle  # to save/load the trained model (bug fix: original line lacked '#' and was a SyntaxError)
import re  # for regular expressions

import nltk  # for NLP utilities
from nltk.stem import WordNetLemmatizer  # to use the WordNet dataset for lemmatization
from sklearn.datasets import load_files  # load_files labels classes automatically from folder names
from sklearn.ensemble import RandomForestClassifier  # for classification
from sklearn.feature_extraction.text import TfidfVectorizer  # get tf-idf values
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split  # to split test and train dataset

nltk.download('wordnet')  # ensure the WordNet corpus is available before lemmatizing



Step 2 - loading data

# Step 2: read the polarity corpus. load_files() infers the class label of
# each review from the name of the sub-folder it lives in (pos / neg).
movie_data = load_files(r"C:\D\Learning\Sentiment Analysis usinf sklearn\txt_sentoken")
X, y = movie_data.data, movie_data.target



Step 3 - data preprocessing and converting documents into TF-IDF values (each document becomes a vector holding the TF-IDF value of every word in the vocabulary)
 
# Step 3: clean each raw review and convert the corpus into TF-IDF features.
# Each review arrives as bytes; str() turns it into its "b'...'" repr string,
# which is why a leading "b" has to be stripped afterwards.
lemmatizer = WordNetLemmatizer()  # hoisted out of the loop: one instance is enough
new_X = []
for data in X:
    text = str(data)
    text = re.sub(r'[^\w]', ' ', text)           # replace every special character with a space
    text = re.sub(r'^b\s+', '', text)            # drop the leading "b" left by the bytes repr
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # drop single-letter tokens
    text = re.sub(r'\s+', ' ', text).strip()     # collapse runs of whitespace
    # Bug fix: the original assigned document.lower() and then immediately
    # overwrote it with document.split(), so the lowercasing was discarded
    # (masked only because TfidfVectorizer lowercases by default).
    words = text.lower().split()  # lemmatization works on individual tokens
    lemmas = [lemmatizer.lemmatize(word) for word in words]
    new_X.append(' '.join(lemmas))  # list of cleaned documents

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(new_X)  # sparse document-term matrix of TF-IDF weights
X_arr = X.toarray()  # dense copy (memory-heavy, but fine for ~2k reviews)



Step 4 - splitting into train and test sets and fitting the classifier

# Step 4: hold out 20% of the data for testing and fit a random forest.
# random_state is added to the split for reproducibility, consistent with
# the fixed seed already used by the classifier below.
X_train, X_test, y_train, y_test = train_test_split(
    X_arr, y, test_size=0.2, random_state=0
)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)



Step 5 - model evaluation

# Step 5: evaluate on the training data (optimistic — the model has seen it)
# and then on the held-out test data.
y_predicted = classifier.predict(X_train)
cf = confusion_matrix(y_train, y_predicted)
print(cf)  # bug fix: the train confusion matrix was computed but never shown
print(classification_report(y_train, y_predicted))

# model evaluation on test data
y_test_predicted = classifier.predict(X_test)
print(confusion_matrix(y_test, y_test_predicted))
print(classification_report(y_test, y_test_predicted))



Step 6 - saving the model and loading it again

# Step 6: persist the fitted classifier, then load it back.
# Bug fix: the statements under each `with` were not indented in the
# original, which is an IndentationError in Python.
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier, picklefile)
with open('text_classifier', 'rb') as mfile:
    model = pickle.load(mfile)



Step 7- test on new document

# Step 7: score a new review file (assumes one review per line — TODO confirm).
# `with` guarantees the file handle is closed (the original leaked it), and the
# prediction is printed — the original `.view()` returns a numpy view and
# displays nothing.
# NOTE(review): "nerw_review.txt" looks like a typo for "new_review.txt" —
# kept as-is since it is a runtime path; confirm against the actual file name.
with open("nerw_review.txt", "r") as review_file:
    data_file = review_file.readlines()

# transform (NOT fit_transform): reuse the vocabulary fitted on the corpus
X1 = vectorizer.transform(data_file)
predict_review = classifier.predict(X1)

print(predict_review)

No comments:

Post a Comment