Импорт библиотек:
In [1]: import lazy_pipeline as lpipe
import [Link] as plt
import pandas as pd
import numpy as np
import time
# предобработка числовых признаков
from [Link] import KBinsDiscretizer
# используемые метрики
from [Link] import accuracy_score, f1_score
Оптимизация отображения ноутбука:
In [2]: from [Link] import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))
Используемые версии библиотек:
In [3]: from platform import python_version
import sklearn
print(python_version())
print([Link].__version__)
print(sklearn.__version__)
print(pd.__version__)
print(np.__version__)
3.7.6
1.0.1
0.22.1
1.0.1
1.18.1
Запуск baseline
In [4]: def process_data(df):
# обработка датасета, замена пустых числовых значений средними по столбцу, категориальных значениями unknown
# оставляем не более 10 наиболее популярных значения каждого категориального признака
for col in df.select_dtypes(['number']).columns:
df[col] = df[col].fillna(df[col].mean())
for col in df.select_dtypes(['object']).columns:
df[col] = df[col].fillna('unknown')
use_values = df[col].value_counts().[Link][0:10]
df[col] = df[col].apply(lambda x: x if x in use_values else 'other')
return df
def discretize_data(df):
# категоризируем числовые признаки: разбиваем на 5 интервалов равной длины
est = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
for col in df.select_dtypes(['number']).columns:
df[col] = est.fit_transform(df[[col]])
return df
def get_scores(y_preds, y_preds_fixedtrain):
# обернем в функцию расчет метрик из исходного ноутбука, принимаем метрики accuracy и f-score для дальнейшего использования
score_vals = {}
for score_f in [accuracy_score, f1_score]:
score_name = score_f.__name__
preds = y_preds
score_vals[score_name] = [score_f(y_test[:i], preds[:i]) for i in range(1, len(preds))]
score_name = score_f.__name__ + '_fixedtrain'
preds = y_preds_fixedtrain
score_vals[score_name] = [score_f(y_test[:i], preds[:i]) for i in range(1, len(preds))]
return score_vals
def get_scores_info (score_vals, t_preds, t_preds_fixedtrain):
return {'accuracy_score' : [Link](score_vals['accuracy_score']),
'accuracy_score_fixedtrain' : [Link](score_vals['accuracy_score_fixedtrain']),
'f1_score' : [Link](score_vals['f1_score']),
'f1_score_fixedtrain' : [Link](score_vals['f1_score_fixedtrain']),
't_preds': [Link](t_preds),
't_preds_fixedtrain' : [Link](t_preds_fixedtrain)}
def plot_metrics(score_vals, t_preds, t_preds_fixedtrain):
# построение графиков метрик и времени расчета
[Link]['[Link]'] = (1,1,1,1)
fig, axs = [Link](2, 2, figsize=(12, 8))
for ax, t in zip(axs[0],['accuracy_score', 'f1_score']):
ax.set_ylim(0-0.05, 1+0.05)
[Link](range(n_train+1, len(X)), score_vals[t], label='baseline clf.')
[Link](range(n_train+1, len(X)), score_vals[t+'_fixedtrain'], label='baseline clf. (fixed train)')
axs[1,0].plot(range(n_train, len(X)), t_preds, label='baseline clf.')
axs[1,0].plot(range(n_train, len(X)), t_preds_fixedtrain, label='baseline clf. (fixed train)')
for (ax, t_verb, dim) in zip([Link](), ['Accuracy score', 'F1 score', 'Prediction time'], ['','','(secs.)']):
ax.set_title('\n'.join([f"{t_verb} progression", "w.r.t. the number of train examples"]), loc='left', size=18)
ax.set_xlabel('# of train examples', size=14)
ax.set_ylabel(f"{t_verb} {dim}".strip(), size=14)
[Link]()
axs[1,1].set_axis_off()
plt.tight_layout()
plt.subplots_adjust()
[Link]()
Подготовка данных
Используется открытый датасет об удовлетворенности клиентов самолетов
[Link]
In [5]: # считываем датасет, переводим целевую переменную в булев тип, убираем служебные столбцы
data = pd.read_csv('data/[Link]')
y_name = 'satisfaction'
data[y_name] = (data[y_name]=='satisfied')
data = [Link][:,2:]
print([Link])
[Link](5)
(25976, 23)
Out[5]:
Arrival
Inflight Ease of On- Leg Departure
Customer Type of Flight Departure/Arrival Gate Inflight Baggage Checkin Inflight Delay
Gender Age Class wifi Online ... board room Cleanliness Delay in
Type Travel Distance time convenient location entertainment handling service service in
service booking service service Minutes
Minutes
Loyal Business
0 Female 52 Eco 160 5 4 3 4 ... 5 5 5 5 2 5 5 50 44.0
Customer travel
Loyal Business
1 Female 36 Business 2863 1 1 3 1 ... 4 4 4 4 3 4 5 0 0.0
Customer travel
disloyal Business
2 Male 20 Eco 192 2 0 2 4 ... 2 4 1 3 2 2 2 0 0.0
Customer travel
Loyal Business
3 Male 44 Business 3377 0 0 0 2 ... 1 1 1 1 3 1 4 0 6.0
Customer travel
Loyal Business
4 Female 49 Eco 1182 2 3 4 3 ... 2 2 2 2 4 2 4 0 20.0
Customer travel
5 rows × 23 columns
Ограничим исследуемую выборку 500 объектами и обработаем данные:
In [6]: data = [Link](500, random_state = 1)
data = process_data(data)
data = discretize_data(data)
Бинаризация данных: one-hot кодирование
In [7]: y = data[y_name]
X = lpipe.binarize_X([Link](y_name, axis=1))
print([Link])
[Link](2)
(500, 99)
Out[7]:
Customer Customer Type of Departure Departure Departure Departure Departure Arrival Arrival Arrival Arrival Arrival
Gender: Gender: Type: Type: Age: Age: Age: Age: Age: Travel: Delay in Delay in Delay in Delay in Delay in Delay in Delay in Delay in Delay in Delay in
...
Female Male Loyal disloyal 0.0 1.0 2.0 3.0 4.0 Business Minutes: Minutes: Minutes: Minutes: Minutes: Minutes: Minutes: Minutes: Minutes: Minutes:
Customer Customer travel 0.0 1.0 2.0 3.0 4.0 0.0 1.0 2.0 3.0 4.0
21362 True False True False False False False False True False ... True False False False False True False False False False
11437 False True False True False True False False False True ... True False False False False True False False False False
2 rows × 99 columns
Представление матрицы признаков как списка множеств:
In [8]: X_bin = [set([Link][x]) for idx, x in [Link]()]
X_bin[0]
Out[8]: {'Age: 4.0',
'Arrival Delay in Minutes: 0.0',
'Baggage handling: 3.0',
'Checkin service: 4.0',
'Class: Eco',
'Cleanliness: 3.0',
'Customer Type: Loyal Customer',
'Departure Delay in Minutes: 0.0',
'Departure/Arrival time convenient: 4.0',
'Ease of Online booking: 3.0',
'Flight Distance: 1.0',
'Food and drink: 2.0',
'Gate location: 0.0',
'Gender: Female',
'Inflight entertainment: 3.0',
'Inflight service: 3.0',
'Inflight wifi service: 3.0',
'Leg room service: 3.0',
'On-board service: 3.0',
'Online boarding: 3.0',
'Seat comfort: 3.0',
'Type of Travel: Personal Travel'}
Перевод целевой переменной в список:
In [9]: y = [Link]()
Предполагаем, что на начальном этапе у нас есть только 10% от общей выборки:
In [10]: n_train = int(len(X)*0.1)
n_test = len(X) - n_train
y_test = y[n_train:]
n_train, n_test
Out[10]: (50, 450)
Применение модели
In [11]: %%time
gen = lpipe.predict_array(X_bin, y, n_train, use_tqdm=True)
y_preds, t_preds = lpipe.apply_stopwatch(gen)
# обновляем обучающую выборку
Predicting step by step: 100%|███████████████████████████████████████████████████████| 500/500 [00:07<00:00, 71.22it/s]
Wall time: 7.02 s
In [12]: %%time
gen = list(lpipe.predict_array(X_bin, y, n_train, use_tqdm=True, update_train=False))
y_preds_fixedtrain, t_preds_fixedtrain = lpipe.apply_stopwatch(gen)
# не обновляем обучающую выборку
Predicting step by step: 100%|█████████████████████████████████████████████████████| 500/500 [00:00<00:00, 2110.59it/s]
Wall time: 241 ms
In [13]: scores = get_scores(y_preds, y_preds_fixedtrain)
plot_metrics(scores, t_preds, t_preds_fixedtrain)
In [14]: get_scores_info(scores, t_preds, t_preds_fixedtrain)
Out[14]: {'accuracy_score': 0.7562200315830624,
'accuracy_score_fixedtrain': 0.7212799822789326,
'f1_score': 0.7625785966832718,
'f1_score_fixedtrain': 0.7376621185160739,
't_preds': 0.01560578982035319,
't_preds_fixedtrain': 0.0}
Модификация алгоритма
In [16]: # используемые алгоритмы
import lightgbm as lgb
from lazy_pipeline import predict_with_generators
from [Link] import DecisionTreeClassifier
Вместо пересечения множеств используем функцию матричного умножения в numpy. Добавим возможность использовать алгоритмы (дерево решений и градиентный бустинг) в lazy
pipeline.
In [17]: def predict_with_dot(x, X_train, Y_train):
X_pos = [Link]([x_train for x_train, y in zip(X_train, Y_train) if y])
X_neg = [Link]([x_train for x_train, y in zip(X_train, Y_train) if not y])
pos_dot = [Link](x,X_pos.T).sum()
neg_dot = [Link](x,X_neg.T).sum()
return pos_dot > neg_dot
def predict_with_model(x, X_train, Y_train, use_model):
model = use_model
[Link](X_train,Y_train)
res = [Link]([x])
return res[0]
# используем для тестирования дерево решений, случайный лес, и градиентный бустинг (без какой-либо настройки параметров моделей)
def predict_with_tree(x, X_train, Y_train):
return predict_with_model(x, X_train, Y_train, DecisionTreeClassifier(max_depth = 6))
def predict_with_boosting(x, X_train, Y_train):
return predict_with_model(x, X_train, Y_train, [Link](max_depth =4, n_estimators = 100))
def train_lpipe(X, y, n_train, predict_function, update_train):
gen = lpipe.predict_array(X = X, Y = y, n_train = n_train, use_tqdm = False, predict_func = predict_function, update_train = update_train )
y_preds, t_preds = lpipe.apply_stopwatch(gen)
return y_preds, t_preds
def get_results(predict_function, X, y, n_train):
results = {}
results['model_name'] = predict_function.__name__
t_start = [Link]()
y_preds, t_preds = train_lpipe(X = X, y = y, n_train = n_train, predict_function = predict_function, update_train = True)
t_stop = [Link]()
results['upd_time'] = t_stop - t_start
t_start = [Link]()
y_preds_fixedtrain, t_preds_fixedtrain = train_lpipe(X = X, y = y, n_train = n_train, predict_function = predict_function, update_train = False)
t_stop = [Link]()
results['fixed_time'] = t_stop - t_start
scores = get_scores(y_preds, y_preds_fixedtrain)
scores_info = get_scores_info(scores, t_preds, t_preds_fixedtrain)
for key in scores_info:
results[key] = scores_info[key]
return results
In [18]: # переведем все данные в числовой формат
X_int = [Link](int).values
In [19]: df_results = [Link]([get_results(predict_function, X_int, y, n_train) for predict_function in [predict_with_dot, predict_with_tree, predict_with_boostin
g]])
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1515: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to n
o true nor predicted samples. Use `zero_division` parameter to control this behavior.
average, "true nor predicted", 'F-score is', len(true_sum)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1515: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to n
o true nor predicted samples. Use `zero_division` parameter to control this behavior.
average, "true nor predicted", 'F-score is', len(true_sum)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1515: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to n
o true nor predicted samples. Use `zero_division` parameter to control this behavior.
average, "true nor predicted", 'F-score is', len(true_sum)
In [20]: # итоговая таблица для сравнения
df_results
Out[20]:
model_name upd_time fixed_time accuracy_score accuracy_score_fixedtrain f1_score f1_score_fixedtrain t_preds t_preds_fixedtrain
0 predict_with_dot 0.148907 0.037977 0.747410 0.817870 0.581582 0.741756 0.000331 0.000084
1 predict_with_tree 0.727553 0.263840 0.836064 0.785045 0.803344 0.728630 0.001617 0.000584
2 predict_with_boosting 10.372624 4.307353 0.862641 0.807403 0.823205 0.762033 0.023050 0.009572