Skip to content

Commit a288e39

Browse files
authored
feat: add kaggle tpl: feedback-prize (#331)
* change feedback tpl * feedback tpl changes * fix feedback tpl * fix train.py of feedback tpl * add rf model for feedback tpl * fix CI
1 parent 034f238 commit a288e39

File tree

7 files changed

+211
-311
lines changed

7 files changed

+211
-311
lines changed
Lines changed: 33 additions & 183 deletions
Original file line numberDiff line numberDiff line change
@@ -1,198 +1,48 @@
1-
# TODO: Fix
1+
import os
22
import re
33

44
import numpy as np # linear algebra
55
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
6+
from sklearn.feature_extraction.text import TfidfVectorizer
7+
from sklearn.model_selection import train_test_split
68

7-
train = pd.read_csv("/kaggle/input/train.csv")
8-
test = pd.read_csv("/kaggle/input/test.csv")
9-
submission = pd.read_csv("/kaggle/input/sample_submission.csv")
109

10+
def preprocess_script():
11+
"""
12+
This method applies the preprocessing steps to the training, validation, and test datasets.
13+
"""
14+
if os.path.exists("/kaggle/input/X_train.pkl"):
15+
X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
16+
X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
17+
y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
18+
y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
19+
X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
20+
others = pd.read_pickle("/kaggle/input/others.pkl")
1121

12-
features = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
13-
target = train[features]
22+
return X_train, X_valid, y_train, y_valid, X_test, *others
1423

24+
def data_cleaner(text):
    """Normalize one essay string before TF-IDF vectorization.

    Steps: strip leading/trailing whitespace, turn every line break
    (together with any whitespace around it) into a single space, then
    lowercase the result.

    Line breaks are replaced by a space instead of being deleted so that
    the last word of one line and the first word of the next line stay
    separate tokens rather than being fused into one.
    """
    text = text.strip()
    # One space per run of line breaks; also absorbs "\r" and padding
    # around the break so no double spaces are introduced.
    text = re.sub(r"\s*\n\s*", " ", text)
    return text.lower()
1529

16-
text_train = train["full_text"]
17-
text_test = test["full_text"]
30+
# train
31+
train = pd.read_csv("/kaggle/input/train.csv")
32+
test = pd.read_csv("/kaggle/input/test.csv")
1833

19-
text = pd.concat([text_train, text_test], ignore_index=True)
34+
train["full_text"] = train["full_text"].apply(data_cleaner)
35+
test["full_text"] = test["full_text"].apply(data_cleaner)
2036

37+
y_train = train[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]
2138

22-
count_words = text.str.findall(r"(\w+)").str.len()
23-
print(count_words.sum())
39+
vectorizer = TfidfVectorizer()
40+
X_train = vectorizer.fit_transform(train["full_text"])
41+
X_test = vectorizer.transform(test["full_text"])
2442

43+
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
44+
X_test = pd.DataFrame.sparse.from_spmatrix(X_test)
2545

26-
""" Cleaning Text """
27-
text = text.str.lower()
46+
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
2847

29-
# removing special characters and numbers
30-
text = text.apply(lambda x: re.sub("[^a-z]\s", "", x))
31-
32-
# remove hash tags
33-
text = text.str.replace("#", "")
34-
35-
# remove words less than 3 character and greater than 7
36-
text = text.apply(lambda x: " ".join([w for w in x.split() if len(w) > 2 and len(w) < 8]))
37-
38-
# removing stopwords
39-
# text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords ))
40-
41-
count_words = text.str.findall(r"(\w+)").str.len()
42-
print(count_words.sum())
43-
44-
45-
most_freq_words = pd.Series(" ".join(text).lower().split()).value_counts()[:25]
46-
text = text.apply(lambda x: " ".join(word for word in x.split() if word not in most_freq_words))
47-
48-
count_words = text.str.findall(r"(\w+)").str.len()
49-
50-
apostrophe_dict = {
51-
"ain't": "am not / are not",
52-
"aren't": "are not / am not",
53-
"can't": "cannot",
54-
"can't've": "cannot have",
55-
"'cause": "because",
56-
"could've": "could have",
57-
"couldn't": "could not",
58-
"couldn't've": "could not have",
59-
"didn't": "did not",
60-
"doesn't": "does not",
61-
"don't": "do not",
62-
"hadn't": "had not",
63-
"hadn't've": "had not have",
64-
"hasn't": "has not",
65-
"haven't": "have not",
66-
"he'd": "he had / he would",
67-
"he'd've": "he would have",
68-
"he'll": "he shall / he will",
69-
"he'll've": "he shall have / he will have",
70-
"he's": "he has / he is",
71-
"how'd": "how did",
72-
"how'd'y": "how do you",
73-
"how'll": "how will",
74-
"how's": "how has / how is",
75-
"i'd": "I had / I would",
76-
"i'd've": "I would have",
77-
"i'll": "I shall / I will",
78-
"i'll've": "I shall have / I will have",
79-
"i'm": "I am",
80-
"i've": "I have",
81-
"isn't": "is not",
82-
"it'd": "it had / it would",
83-
"it'd've": "it would have",
84-
"it'll": "it shall / it will",
85-
"it'll've": "it shall have / it will have",
86-
"it's": "it has / it is",
87-
"let's": "let us",
88-
"ma'am": "madam",
89-
"mayn't": "may not",
90-
"might've": "might have",
91-
"mightn't": "might not",
92-
"mightn't've": "might not have",
93-
"must've": "must have",
94-
"mustn't": "must not",
95-
"mustn't've": "must not have",
96-
"needn't": "need not",
97-
"needn't've": "need not have",
98-
"o'clock": "of the clock",
99-
"oughtn't": "ought not",
100-
"oughtn't've": "ought not have",
101-
"shan't": "shall not",
102-
"sha'n't": "shall not",
103-
"shan't've": "shall not have",
104-
"she'd": "she had / she would",
105-
"she'd've": "she would have",
106-
"she'll": "she shall / she will",
107-
"she'll've": "she shall have / she will have",
108-
"she's": "she has / she is",
109-
"should've": "should have",
110-
"shouldn't": "should not",
111-
"shouldn't've": "should not have",
112-
"so've": "so have",
113-
"so's": "so as / so is",
114-
"that'd": "that would / that had",
115-
"that'd've": "that would have",
116-
"that's": "that has / that is",
117-
"there'd": "there had / there would",
118-
"there'd've": "there would have",
119-
"there's": "there has / there is",
120-
"they'd": "they had / they would",
121-
"they'd've": "they would have",
122-
"they'll": "they shall / they will",
123-
"they'll've": "they shall have / they will have",
124-
"they're": "they are",
125-
"they've": "they have",
126-
"to've": "to have",
127-
"wasn't": "was not",
128-
"we'd": "we had / we would",
129-
"we'd've": "we would have",
130-
"we'll": "we will",
131-
"we'll've": "we will have",
132-
"we're": "we are",
133-
"we've": "we have",
134-
"weren't": "were not",
135-
"what'll": "what shall / what will",
136-
"what'll've": "what shall have / what will have",
137-
"what're": "what are",
138-
"what's": "what has / what is",
139-
"what've": "what have",
140-
"when's": "when has / when is",
141-
"when've": "when have",
142-
"where'd": "where did",
143-
"where's": "where has / where is",
144-
"where've": "where have",
145-
"who'll": "who shall / who will",
146-
"who'll've": "who shall have / who will have",
147-
"who's": "who has / who is",
148-
"who've": "who have",
149-
"why's": "why has / why is",
150-
"why've": "why have",
151-
"will've": "will have",
152-
"won't": "will not",
153-
"won't've": "will not have",
154-
"would've": "would have",
155-
"wouldn't": "would not",
156-
"wouldn't've": "would not have",
157-
"y'all": "you all",
158-
"y'all'd": "you all would",
159-
"y'all'd've": "you all would have",
160-
"y'all're": "you all are",
161-
"y'all've": "you all have",
162-
"you'd": "you had / you would",
163-
"you'd've": "you would have",
164-
"you'll": "you shall / you will",
165-
"you'll've": "you shall have / you will have",
166-
"you're": "you are",
167-
"you've": "you have",
168-
}
169-
170-
171-
def lookup_dict(txt, dictionary):
172-
for word in txt.split():
173-
if word.lower() in dictionary:
174-
if word.lower() in txt.split():
175-
txt = txt.replace(word, dictionary[word.lower()])
176-
return txt
177-
178-
179-
text = text.apply(lambda x: lookup_dict(x, apostrophe_dict))
180-
181-
# Remove rare words
182-
from collections import Counter
183-
from itertools import chain
184-
185-
# split words into lists
186-
v = text.str.split().tolist()
187-
# compute global word frequency
188-
c = Counter(chain.from_iterable(v))
189-
# filter, join, and re-assign
190-
text = [" ".join([j for j in i if c[j] > 1]) for i in v]
191-
text = pd.Series(text)
192-
193-
total_word = 0
194-
for x, word in enumerate(text):
195-
num_word = len(word.split())
196-
# print(num_word)
197-
total_word = total_word + num_word
198-
print(total_word)
48+
return X_train, X_valid, y_train, y_valid, X_test
Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,23 @@
1-
import numpy as np
21
import pandas as pd
3-
from sklearn.feature_extraction.text import TfidfVectorizer
42

3+
"""
4+
Here is the feature engineering code for each task, with a class that has a fit and transform method.
5+
Remember
6+
"""
57

6-
class TfidfFeature:
8+
9+
class IdentityFeature:
    """Pass-through feature step: nothing is learned and the data is left untouched."""

    def fit(self, train_df: pd.DataFrame):
        """
        No-op fit; the training data is ignored and None is returned.
        """
        return None

    def transform(self, X: pd.DataFrame):
        """
        Hand back the very same object that was passed in, unmodified.
        """
        return X
21+
22+
23+
feature_engineering_cls = IdentityFeature

rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py

Lines changed: 0 additions & 18 deletions
This file was deleted.
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import numpy as np
2+
import pandas as pd
3+
from sklearn.ensemble import RandomForestRegressor
4+
5+
6+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Feature-selection hook shared by fit and predict.

    Currently a pass-through: the input frame is returned as-is.
    """
    # Every column is kept for now; real selection logic can be added here later.
    selected = X
    return selected
12+
13+
14+
def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Train a random forest regressor on the selected training features.

    X_valid and y_valid are accepted but not used by this function.
    Returns the fitted model.
    """
    # Apply the same selection hook that predict() uses.
    features = select(X_train)

    forest = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1)
    forest.fit(features, y_train)

    return forest
28+
29+
30+
def predict(model, X):
    """
    Run the trained model on X after applying the same feature selection used in fit.
    """
    return model.predict(select(X))
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""
2+
motivation of the model
3+
"""
4+
5+
import pandas as pd
6+
import xgboost as xgb
7+
from sklearn.multioutput import MultiOutputRegressor
8+
9+
10+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """Feature-selection hook; currently returns the input frame unchanged."""
    kept = X  # no selection logic is applied
    return kept
13+
14+
15+
def is_sparse_df(df: pd.DataFrame) -> bool:
    """Return True when ``df`` has at least one column and every column is sparse.

    The result is used to decide whether ``df.sparse.to_coo()`` may be
    called. The pandas ``.sparse`` accessor is only available when all
    columns have a ``pd.SparseDtype``, so a frame that mixes sparse and
    dense columns must be reported as not sparse; otherwise the accessor
    raises ``AttributeError``.
    """
    # Check whether each column of the DataFrame has a sparse dtype.
    dtypes = df.dtypes
    # A frame with no columns is treated as not sparse (all() alone would say True).
    return len(dtypes) > 0 and all(isinstance(dtype, pd.SparseDtype) for dtype in dtypes)
18+
19+
20+
def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """
    Train a multi-output XGBoost regressor on the selected training features.

    A frame reported as sparse by is_sparse_df is converted with
    ``.sparse.to_coo()`` before fitting. X_valid and y_valid are accepted
    but not used by this function. Returns the fitted model.
    """
    features = select(X_train)
    if is_sparse_df(features):
        features = features.sparse.to_coo()

    base_regressor = xgb.XGBRegressor(n_estimators=500, random_state=0, objective="reg:squarederror")
    regressor = MultiOutputRegressor(base_regressor, n_jobs=2)
    regressor.fit(features, y_train)

    return regressor
33+
34+
35+
def predict(model, X_test):
    """
    Predict with the trained model, applying the same feature selection and
    sparse conversion that fit uses.
    """
    features = select(X_test)
    features = features.sparse.to_coo() if is_sparse_df(features) else features
    return model.predict(features)

0 commit comments

Comments
 (0)