-# TODO: Fix
+import os
import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split

-train = pd.read_csv("/kaggle/input/train.csv")
-test = pd.read_csv("/kaggle/input/test.csv")
-submission = pd.read_csv("/kaggle/input/sample_submission.csv")

+def preprocess_script():
+    """
+    Load cached train/validation/test splits when they exist; otherwise
+    clean the raw CSVs, build TF-IDF features, and carve out a validation set.
+    """
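+    # Fast path: reuse splits pickled by an earlier run.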
+    if os.path.exists("/kaggle/input/X_train.pkl"):
+        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
+        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
+        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
+        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
+        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
+        others = pd.read_pickle("/kaggle/input/others.pkl")

-features = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
-target = train[features]
+        # "others" holds extra cached objects; it is deliberately not returned,
+        # so this path yields the same five values as the cold path below.
+        return X_train, X_valid, y_train, y_valid, X_test

+    def data_cleaner(text):
+        text = text.strip()
+        # Replace newlines with a space (not "") so words on adjacent lines don't merge.
+        text = re.sub(r"\n", " ", text)
+        text = text.lower()
+        return text

-text_train = train["full_text"]
-text_test = test["full_text"]
+    # Cold path: rebuild the features from the raw competition CSVs.
+    train = pd.read_csv("/kaggle/input/train.csv")
+    test = pd.read_csv("/kaggle/input/test.csv")

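+    # Normalize the essay text before vectorizing.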
-text = pd.concat([text_train, text_test], ignore_index=True)
+    train["full_text"] = train["full_text"].apply(data_cleaner)
+    test["full_text"] = test["full_text"].apply(data_cleaner)

+    y_train = train[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]

-count_words = text.str.findall(r"(\w+)").str.len()
-print(count_words.sum())
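+    # Fit the TF-IDF vocabulary on the training essays only; the test essays
+    # are transformed with that same vocabulary.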
+    vectorizer = TfidfVectorizer()
+    X_train = vectorizer.fit_transform(train["full_text"])
+    X_test = vectorizer.transform(test["full_text"])

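+    # Wrap the scipy sparse matrices as pandas sparse DataFrames so downstream
+    # code can keep using DataFrame APIs without densifying.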
+    X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
+    X_test = pd.DataFrame.sparse.from_spmatrix(X_test)

-""" Cleaning Text """
-text = text.str.lower()
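+    # Hold out 20% of the training rows for validation; the fixed seed keeps
+    # the split reproducible across runs.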
+    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

-# removing special characters and numbers
-text = text.apply(lambda x: re.sub("[^a-z]\s", "", x))
-
-# remove hash tags
-text = text.str.replace("#", "")
-
-# remove words less than 3 character and greater than 7
-text = text.apply(lambda x: " ".join([w for w in x.split() if len(w) > 2 and len(w) < 8]))
-
-# removing stopwords
-# text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords ))
-
-count_words = text.str.findall(r"(\w+)").str.len()
-print(count_words.sum())
-
-
-most_freq_words = pd.Series(" ".join(text).lower().split()).value_counts()[:25]
-text = text.apply(lambda x: " ".join(word for word in x.split() if word not in most_freq_words))
-
-count_words = text.str.findall(r"(\w+)").str.len()
-
-apostrophe_dict = {
-    "ain't": "am not / are not",
-    "aren't": "are not / am not",
-    "can't": "cannot",
-    "can't've": "cannot have",
-    "'cause": "because",
-    "could've": "could have",
-    "couldn't": "could not",
-    "couldn't've": "could not have",
-    "didn't": "did not",
-    "doesn't": "does not",
-    "don't": "do not",
-    "hadn't": "had not",
-    "hadn't've": "had not have",
-    "hasn't": "has not",
-    "haven't": "have not",
-    "he'd": "he had / he would",
-    "he'd've": "he would have",
-    "he'll": "he shall / he will",
-    "he'll've": "he shall have / he will have",
-    "he's": "he has / he is",
-    "how'd": "how did",
-    "how'd'y": "how do you",
-    "how'll": "how will",
-    "how's": "how has / how is",
-    "i'd": "I had / I would",
-    "i'd've": "I would have",
-    "i'll": "I shall / I will",
-    "i'll've": "I shall have / I will have",
-    "i'm": "I am",
-    "i've": "I have",
-    "isn't": "is not",
-    "it'd": "it had / it would",
-    "it'd've": "it would have",
-    "it'll": "it shall / it will",
-    "it'll've": "it shall have / it will have",
-    "it's": "it has / it is",
-    "let's": "let us",
-    "ma'am": "madam",
-    "mayn't": "may not",
-    "might've": "might have",
-    "mightn't": "might not",
-    "mightn't've": "might not have",
-    "must've": "must have",
-    "mustn't": "must not",
-    "mustn't've": "must not have",
-    "needn't": "need not",
-    "needn't've": "need not have",
-    "o'clock": "of the clock",
-    "oughtn't": "ought not",
-    "oughtn't've": "ought not have",
-    "shan't": "shall not",
-    "sha'n't": "shall not",
-    "shan't've": "shall not have",
-    "she'd": "she had / she would",
-    "she'd've": "she would have",
-    "she'll": "she shall / she will",
-    "she'll've": "she shall have / she will have",
-    "she's": "she has / she is",
-    "should've": "should have",
-    "shouldn't": "should not",
-    "shouldn't've": "should not have",
-    "so've": "so have",
-    "so's": "so as / so is",
-    "that'd": "that would / that had",
-    "that'd've": "that would have",
-    "that's": "that has / that is",
-    "there'd": "there had / there would",
-    "there'd've": "there would have",
-    "there's": "there has / there is",
-    "they'd": "they had / they would",
-    "they'd've": "they would have",
-    "they'll": "they shall / they will",
-    "they'll've": "they shall have / they will have",
-    "they're": "they are",
-    "they've": "they have",
-    "to've": "to have",
-    "wasn't": "was not",
-    "we'd": "we had / we would",
-    "we'd've": "we would have",
-    "we'll": "we will",
-    "we'll've": "we will have",
-    "we're": "we are",
-    "we've": "we have",
-    "weren't": "were not",
-    "what'll": "what shall / what will",
-    "what'll've": "what shall have / what will have",
-    "what're": "what are",
-    "what's": "what has / what is",
-    "what've": "what have",
-    "when's": "when has / when is",
-    "when've": "when have",
-    "where'd": "where did",
-    "where's": "where has / where is",
-    "where've": "where have",
-    "who'll": "who shall / who will",
-    "who'll've": "who shall have / who will have",
-    "who's": "who has / who is",
-    "who've": "who have",
-    "why's": "why has / why is",
-    "why've": "why have",
-    "will've": "will have",
-    "won't": "will not",
-    "won't've": "will not have",
-    "would've": "would have",
-    "wouldn't": "would not",
-    "wouldn't've": "would not have",
-    "y'all": "you all",
-    "y'all'd": "you all would",
-    "y'all'd've": "you all would have",
-    "y'all're": "you all are",
-    "y'all've": "you all have",
-    "you'd": "you had / you would",
-    "you'd've": "you would have",
-    "you'll": "you shall / you will",
-    "you'll've": "you shall have / you will have",
-    "you're": "you are",
-    "you've": "you have",
-}
-
-
-def lookup_dict(txt, dictionary):
-    for word in txt.split():
-        if word.lower() in dictionary:
-            if word.lower() in txt.split():
-                txt = txt.replace(word, dictionary[word.lower()])
-    return txt
-
-
-text = text.apply(lambda x: lookup_dict(x, apostrophe_dict))
-
-# Remove rare words
-from collections import Counter
-from itertools import chain
-
-# split words into lists
-v = text.str.split().tolist()
-# compute global word frequency
-c = Counter(chain.from_iterable(v))
-# filter, join, and re-assign
-text = [" ".join([j for j in i if c[j] > 1]) for i in v]
-text = pd.Series(text)
-
-total_word = 0
-for x, word in enumerate(text):
-    num_word = len(word.split())
-    # print(num_word)
-    total_word = total_word + num_word
-print(total_word)
+    return X_train, X_valid, y_train, y_valid, X_test
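
For context, a minimal sketch of how the five returned values might be consumed downstream; the Ridge baseline, the RMSE check, and the names below are illustrative assumptions, not part of this commit:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

X_train, X_valid, y_train, y_valid, X_test = preprocess_script()

# Convert the pandas sparse frames back to scipy matrices for fitting.
X_tr = X_train.sparse.to_coo().tocsr()
X_va = X_valid.sparse.to_coo().tocsr()

# Ridge accepts sparse input and treats the six scores as one multi-output regression.
model = Ridge()
model.fit(X_tr, y_train)

# One aggregate RMSE across the six targets on the held-out validation rows.
preds = model.predict(X_va)
print(np.sqrt(mean_squared_error(y_valid, preds)))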