import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, GRU,
                                     Dense, Dropout, Concatenate)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from transformers import GPT2Tokenizer, GPT2Model
print('import done')
# Define hyperparameters
MAX_SEQ_LENGTH = 100
GPT_HIDDEN_SIZE = 768  # hidden size of the base GPT-2 model
FILTER_SIZES = [3, 5, 7]
NUM_FILTERS = 256
GRU_UNITS = 256
DENSE_UNITS = 1
DROPOUT_RATE = 0.8  # fairly aggressive; applied to the GRU output below
print('config done')
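# Optional: seed the RNGs so runs are repeatable. A minimal sketch, not part
# of the original script; the seed value 42 is an arbitrary choice.
import random
import tensorflow as tf
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)
torch.manual_seed(42)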
# Load the dataset
df = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv",
                 encoding='latin-1',
                 header=None)
df.columns = ["sentiment", "id", "date", "query", "user", "text"]
df = df[["sentiment", "text"]]
df["sentiment"] = df["sentiment"].replace({0: "negative", 4: "positive"})
texts = df["text"].values
labels = df["sentiment"].values
labels = np.array([1 if label == "positive" else 0 for label in labels])
print(df.head(10))
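# Optional: extracting GPT-2 features for all 1.6M tweets is very slow on a
# single GPU, so a stratified subsample is often used for prototyping. This
# is a sketch under that assumption; SAMPLE_PER_CLASS is a hypothetical
# setting, not from the original pipeline.
SUBSAMPLE = False
if SUBSAMPLE:
    SAMPLE_PER_CLASS = 25000
    df = df.groupby("sentiment", group_keys=False).apply(
        lambda g: g.sample(n=SAMPLE_PER_CLASS, random_state=42))
    texts = df["text"].values
    labels = np.array([1 if s == "positive" else 0
                       for s in df["sentiment"].values])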
# Preprocess text data: use frozen GPT-2 hidden states as pre-trained
# token embeddings for the Keras model below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt_model = GPT2Model.from_pretrained("gpt2").to(device)
gpt_model.eval()  # inference only; the GPT-2 weights stay frozen

def extract_gpt_features(text):
    inputs = tokenizer.encode_plus(text, add_special_tokens=True,
                                   truncation=True, max_length=MAX_SEQ_LENGTH,
                                   return_tensors="pt")
    inputs = inputs.to(device)
    with torch.no_grad():
        outputs = gpt_model(**inputs)[0]  # last hidden state: (1, seq_len, 768)
    # Return the full per-token sequence. The original sliced out only the
    # first token's state, which in a causal model sees just one token and
    # also collapses the sequence the Conv1D/GRU layers below expect.
    return outputs.squeeze(0).cpu().numpy()  # (seq_len, GPT_HIDDEN_SIZE)
# Extract features for every tweet and pad/truncate to a fixed length;
# dtype='float32' matters, since pad_sequences' int32 default would truncate the embeddings.
sequences = [extract_gpt_features(text) for text in texts]
data = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, dtype='float32',
                     padding='post', truncating='post')
print('features done:', data.shape)
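# Optional speed-up: batch the GPT-2 forward passes instead of encoding one
# tweet at a time. A sketch assuming a batch size of 64 fits in GPU memory;
# GPT-2 has no pad token, so the EOS token is reused for padding.
def extract_gpt_features_batched(text_list, batch_size=64):
    tokenizer.pad_token = tokenizer.eos_token
    feats = []
    for i in range(0, len(text_list), batch_size):
        batch = list(text_list[i:i + batch_size])
        enc = tokenizer(batch, padding='max_length', truncation=True,
                        max_length=MAX_SEQ_LENGTH, return_tensors='pt').to(device)
        with torch.no_grad():
            out = gpt_model(**enc)[0]  # (batch, MAX_SEQ_LENGTH, 768)
        feats.append(out.cpu().numpy())
    return np.concatenate(feats, axis=0)
# data = extract_gpt_features_batched(texts)  # already padded; skip pad_sequences
# np.save('gpt2_features.npy', data)          # cache features to avoid recomputation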
# Define model architecture: parallel Conv1D branches over the GPT-2
# features, concatenated and fed to a GRU. The Embedding layer is gone (the
# inputs are already dense vectors), and padding='same' keeps all branches
# the same length so they can be concatenated; the original full-width
# pooling left the GRU only a single timestep to read.
inputs = Input(shape=(MAX_SEQ_LENGTH, GPT_HIDDEN_SIZE))
conv_layers = []
for filter_size in FILTER_SIZES:
    conv = Conv1D(filters=NUM_FILTERS, kernel_size=filter_size,
                  padding='same', activation='relu')(inputs)
    pool = MaxPooling1D(pool_size=2)(conv)
    conv_layers.append(pool)
concat = Concatenate()(conv_layers)  # (batch, MAX_SEQ_LENGTH // 2, 3 * NUM_FILTERS)
gru = GRU(units=GRU_UNITS)(concat)
dropout = Dropout(rate=DROPOUT_RATE)(gru)
outputs = Dense(units=DENSE_UNITS, activation='sigmoid')(dropout)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
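# Optional: render the parallel-branch structure to an image (requires the
# pydot and graphviz packages; a sketch, not part of the original pipeline).
# from tensorflow.keras.utils import plot_model
# plot_model(model, to_file='model.png', show_shapes=True)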
# Train the model with early stopping
es = EarlyStopping(monitor='val_accuracy', patience=5, mode='max', min_delta=0.01,
baseline=0.85)
history = model.fit(data, labels, epochs=50, validation_split=0.3, callbacks=[es])
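# Optional: persist the trained weights so inference can be rerun without
# retraining (a sketch; the filename is an arbitrary choice, not from the
# original script).
model.save('gpt2_cnn_gru_sentiment.h5')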
# Plot accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
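# Plot loss alongside accuracy, mirroring the accuracy plot above.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()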
# Make predictions on new data
new_texts = ["is upset that he can't update his Facebook by texting it... and might cry as a result School today ...",
             "@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds",
             "my whole body feels itchy and like its on fire",
             "@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all o...",
             "@Kwesidei not the whole crew",
             "@LettyA ahh ive always wanted to see rent love the soundtrack!!",
             "@FakerPattyPattz Oh dear. Were you drinking out of the forgotten table drinks? "]
new_sequences = [extract_gpt_features(text) for text in new_texts]
new_data = pad_sequences(new_sequences, maxlen=MAX_SEQ_LENGTH, dtype='float32',
                         padding='post', truncating='post')
predictions = model.predict(new_data)
# Evaluate the model on the hand-labelled examples
y_pred = np.round(predictions).astype(int).ravel()
y_true = np.array([0, 0, 0, 0, 0, 0, 0])  # True labels of new data (all negative)
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])  # force a 2x2 matrix even if one class is absent
print(cm)
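# Optional: print each tweet with its predicted label and score, which is
# easier to eyeball than the raw confusion matrix (a sketch, not in the
# original script).
for text, score in zip(new_texts, predictions.ravel()):
    label = 'positive' if score >= 0.5 else 'negative'
    print(f"{label} ({score:.3f}): {text[:60]}")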