# Data Loading
import pandas as pd

imdb = pd.read_csv('imdb.csv')
imdb.columns = ["index", "text", "label"]
print(imdb.head(5))
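
# A quick sanity check before preprocessing (a minimal sketch, assuming the
# CSV loaded cleanly): look for missing values and the label distribution.
print(imdb.isnull().sum())
print(imdb['label'].value_counts())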
-------------------------------------------------------------
data_size = imdb.shape
print(data_size)
imdb_col_names = list(imdb.columns)
print(imdb_col_names)
print(imdb.groupby('label').describe())
print(imdb.head(3))
-------------------------------------------------------------
imdb_target = imdb['label']
print(imdb_target)
-------------------------------------------------------------
from nltk.tokenize import word_tokenize
import nltk
nltk.download('all')  # 'punkt', 'wordnet', and 'stopwords' are the corpora actually used below

def split_tokens(text):
    message = text.lower()
    word_tokens = word_tokenize(message)  # tokenize the lowercased text, not the original
    return word_tokens

imdb['tokenized_message'] = imdb.apply(lambda row: split_tokens(row['text']), axis=1)
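
# Spot-check the tokenizer on one review (index 0 is arbitrary);
# word_tokenize also splits punctuation into separate tokens.
print(imdb['tokenized_message'][0][:20])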
-------------------------------------------------------------
from nltk.stem.wordnet import WordNetLemmatizer

def split_into_lemmas(text):
    lemma = []
    lemmatizer = WordNetLemmatizer()
    for word in text:
        a = lemmatizer.lemmatize(word)
        lemma.append(a)
    return lemma

imdb['lemmatized_message'] = imdb.apply(
    lambda row: split_into_lemmas(row['tokenized_message']), axis=1)
print('Tokenized message :', imdb['tokenized_message'][55])
print('Lemmatized message:', imdb['lemmatized_message'][55])
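
# Note: WordNetLemmatizer treats every token as a noun unless told otherwise,
# so verb forms pass through unchanged. An illustrative comparison (not part
# of the pipeline above):
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('movies'))            # 'movie'   (noun default)
print(lemmatizer.lemmatize('running'))           # 'running' (unchanged as a noun)
print(lemmatizer.lemmatize('running', pos='v'))  # 'run'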
-------------------------------------------------------------
from nltk.corpus import stopwords

def stopword_removal(text):
    stop_words = set(stopwords.words('english'))
    # keep only non-stopword tokens and join them back into a single string
    filtered_sentence = ' '.join([word for word in text if word not in stop_words])
    return filtered_sentence

imdb['preprocessed_message'] = imdb.apply(
    lambda row: stopword_removal(row['lemmatized_message']), axis=1)
print('Preprocessed message:', imdb['preprocessed_message'])
training_data = pd.Series(list(imdb['preprocessed_message']))
training_label = pd.Series(list(imdb['label']))
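
# End-to-end spot check on one review: lemmatized tokens in, cleaned string out.
print('Before:', imdb['lemmatized_message'][55][:15])
print('After :', imdb['preprocessed_message'][55][:120])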
-------------------------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# min_df of 1/n_documents keeps any term appearing at least once;
# max_df=0.7 drops terms occurring in more than 70% of the reviews.
tf_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                min_df=(1 / len(training_label)),
                                max_df=0.7)
Total_Dictionary_TDM = tf_vectorizer.fit(training_data)
message_data_TDM = Total_Dictionary_TDM.transform(training_data)
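
# fit() returns the vectorizer itself, so the learned vocabulary is available
# on either name; the TDM is a sparse matrix of shape (n_documents, n_features).
print('Vocabulary size:', len(tf_vectorizer.vocabulary_))
print('TDM shape      :', message_data_TDM.shape)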
-------------------------------------------------------------
# TfidfVectorizer was imported above alongside CountVectorizer.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                   min_df=(1 / len(training_label)),
                                   max_df=0.7)
Total_Dictionary_TFIDF = tfidf_vectorizer.fit(training_data)
message_data_TFIDF = Total_Dictionary_TFIDF.transform(training_data)
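
# TF-IDF downweights terms that appear in many documents. A sketch that lists
# the most common (lowest-idf) terms; on scikit-learn < 1.0 use
# get_feature_names() instead of get_feature_names_out().
import numpy as np
terms = tfidf_vectorizer.get_feature_names_out()
idf = tfidf_vectorizer.idf_
most_common = np.argsort(idf)[:5]
print(list(zip(terms[most_common], idf[most_common])))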
-------------------------------------------------------------
# Splitting the data for training and testing
from sklearn.model_selection import train_test_split

seed = 9
train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM, training_label, test_size=0.1, random_state=seed)
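
# A stratified variant (illustrative, not what the script uses): stratify
# keeps the positive/negative ratio identical in the train and test folds.
s_train, s_test, s_train_lbl, s_test_lbl = train_test_split(
    message_data_TDM, training_label, test_size=0.1,
    stratify=training_label, random_state=seed)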
-------------------------------------------------------------
from sklearn.svm import SVC
train_data_shape = train_data.shape
test_data_shape = test_data.shape
print("The shape of train data :", train_data.shape)
print("The shape of test data  :", test_data.shape)
classifier = SVC(kernel="linear", C=0.025, random_state=seed)
classifier = classifier.fit(train_data, train_label)
score = classifier.score(test_data, test_label)  # mean accuracy on the held-out fold
print('SVM Classifier : ', score)
with open('output.txt', 'w') as file:
    file.write(str((imdb['tokenized_message'][55], imdb['lemmatized_message'][55])))
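
# A fuller picture than a single accuracy number: per-class precision/recall
# on the held-out fold (a sketch using sklearn's standard metrics).
from sklearn.metrics import classification_report
predicted = classifier.predict(test_data)
print(classification_report(test_label, predicted))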
-------------------------------------------------------------
from sklearn.linear_model import SGDClassifier
# Re-split so the SGD model gets its own train/test fold.
train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM, training_label, test_size=0.1, random_state=seed)
train_data_shape = train_data.shape
test_data_shape = test_data.shape
print("The shape of train data :", train_data.shape)
print("The shape of test data  :", test_data.shape)
classifier = SGDClassifier(loss='modified_huber', shuffle=True, random_state=seed)
classifier = classifier.fit(train_data, train_label)
score = classifier.score(test_data, test_label)
print('SGD classifier : ', score)
with open('output1.txt', 'w') as file:
    file.write(str(imdb['preprocessed_message'][55]))
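
# loss='modified_huber' is one of the SGD losses that supports predict_proba,
# so the model can also emit class probabilities (sketch, first 5 test rows):
probs = classifier.predict_proba(test_data[:5])
print(probs)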
-------------------------------------------------------------