import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error
# Load the salary data set from the working directory.
df = pd.read_csv('SALARY2.csv')
# Preview of the first rows; the result is only displayed in an interactive
# session (notebook/REPL) and is otherwise discarded.
df.head()

# Keep just the predictor and the target column for the regression below.
df_binary = df[['Years of Experience', 'Salary']]

# Exploratory scatter plot with a second-order polynomial fit and no
# confidence band.
sns.lmplot(
    data=df_binary,
    x="Years of Experience",
    y="Salary",
    order=2,
    ci=None,
)
plt.show()
# Clean the data BEFORE building the arrays, so the cleaning actually reaches
# the values that are fitted:
#   1. forward-fill gaps from the previous row (``.ffill()`` replaces the
#      deprecated ``fillna(method='ffill')``);
#   2. drop any row that is still incomplete -- forward fill cannot repair a
#      gap in the very first row, and LinearRegression rejects NaN input.
# Rebinding the name (instead of ``inplace=True``) avoids mutating a
# column-selection of ``df`` and the SettingWithCopyWarning that comes with it.
df_binary = df_binary.ffill().dropna()

# Separate the independent (X) and dependent (y) variables.  Each is a single
# column, so reshape to the (n_samples, 1) layout scikit-learn expects.
X = np.array(df_binary['Years of Experience']).reshape(-1, 1)
y = np.array(df_binary['Salary']).reshape(-1, 1)
# Hold out 25% of the samples for evaluation.  No random_state is passed, so
# the split (and therefore every number printed below) varies between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fit an ordinary least-squares line on the training portion.
regr = LinearRegression().fit(X_train, y_train)

# R^2 of the fitted model on the held-out samples.
r_squared = regr.score(X_test, y_test)
print('Coefficient of determination: %.2f' % r_squared)

# Predictions for the held-out samples, drawn (black line) over the actual
# held-out points (blue markers).
y_pred = regr.predict(X_test)
plt.scatter(X_test, y_test, color='b')
plt.plot(X_test, y_pred, color='k')
plt.show()
# Error metrics on the held-out samples.
mae = mean_absolute_error(y_test, y_pred)
# Called without the ``squared`` argument, so this is the plain mean squared
# error (not its square root).
mse = mean_squared_error(y_test, y_pred)

for label, value in (("MAE:", mae), ("MSE:", mse)):
    print(label, value)
Output:
Coefficient of determination:0.27853954081632637
MAE: 1484.71615720524
MSE: 2808565.8168227146
Dataset:
Years of Experience    Salary
5 5000
3 6000
15 4000
7 3000
20 2000