DISHANT KUMAR YADAV 2021BCS0136
#DISHANT KUMAR YADAV
import numpy as np
import pandas as pd
# Read the salary CSV into a DataFrame. The bare `df` expression on the last
# line is what a notebook renders as the cell output.
df = pd.read_csv(filepath_or_buffer='/content/sample_data/Salary_Data.csv')
df
#DISHANT KUMAR YADAV
YearsExperience Salary
0 1.1 39343.0
1 1.3 46205.0
2 1.5 37731.0
3 2.0 43525.0
4 2.2 39891.0
5 2.9 56642.0
6 3.0 60150.0
7 3.2 54445.0
8 3.2 64445.0
9 3.7 57189.0
10 3.9 63218.0
11 4.0 55794.0
12 4.0 56957.0
13 4.1 57081.0
14 4.5 61111.0
15 4.9 67938.0
16 5.1 66029.0
17 5.3 83088.0
18 5.9 81363.0
19 6.0 93940.0
20 6.8 91738.0
21 7.1 98273.0
22 7.9 101302.0
23 8.2 113812.0
24 8.7 109431.0
25 9.0 105582.0
26 9.5 116969.0
27 9.6 112635.0
28 10.3 122391.0
29 10.5 121872.0
#DISHANT KUMAR YADAV
import matplotlib.pyplot as plt
# Scatter plot of salary against years of experience.
exp, sal = df['YearsExperience'], df['Salary']
plt.scatter(x=exp, y=sal)
plt.xlabel(xlabel='Experience')
plt.ylabel(ylabel='Salary')
#DISHANT KUMAR YADAV
Text(0, 0.5, 'Salary')
#DISHANT KUMAR YADAV
# Convert both columns to NumPy arrays; the trailing tuple expression shows
# the two array shapes.
exp_np, sal_np = (column.to_numpy() for column in (exp, sal))
exp_np.shape, sal_np.shape
#DISHANT KUMAR YADAV
((30,), (30,))
#DISHANT KUMAR YADAV
from sklearn.linear_model import LinearRegression
# Fit a linear regression of salary on experience and predict on the same
# points that were used for fitting.
# The 1-D experience array is reshaped into a single-column 2-D matrix before
# being passed to fit/predict. Using -1 lets NumPy infer the number of rows
# instead of hard-coding 30, so this keeps working when the CSV has a
# different number of rows.
exp_features = exp_np.reshape(-1, 1)
sklearn_model = LinearRegression().fit(exp_features, sal_np)
sklearn_sal_predictions = sklearn_model.predict(exp_features)
sklearn_sal_predictions.shape
#DISHANT KUMAR YADAV
(30,)
#DISHANT KUMAR YADAV
# Overlay the model's predictions on the observed data: the first scatter
# draws the actual salaries, the second draws the predicted salaries at the
# same experience values.
exp, sal = df['YearsExperience'], df['Salary']
plt.scatter(x=exp, y=sal)
plt.xlabel(xlabel='Experience')
plt.ylabel(ylabel='Salary')
plt.scatter(x=exp, y=sklearn_sal_predictions)
#DISHANT KUMAR YADAV
output <matplotlib.collections.PathCollection at 0x7c7e2822d360>
#DISHANT KUMAR YADAV
# Table placing each actual salary next to the corresponding model prediction.
predictions_df = pd.DataFrame(
    {
        'YearsExperience': exp,
        'Salary': sal,
        'Sklearn salary prediction': sklearn_sal_predictions,
    }
)
predictions_df
#DISHANT KUMAR YADAV
YearsExperience Salary Sklearn salary prediction
0 1.1 39343.0 36187.158752
1 1.3 46205.0 38077.151217
2 1.5 37731.0 39967.143681
3 2.0 43525.0 44692.124842
4 2.2 39891.0 46582.117306
5 2.9 56642.0 53197.090931
6 3.0 60150.0 54142.087163
7 3.2 54445.0 56032.079627
8 3.2 64445.0 56032.079627
9 3.7 57189.0 60757.060788
10 3.9 63218.0 62647.053252
11 4.0 55794.0 63592.049484
12 4.0 56957.0 63592.049484
13 4.1 57081.0 64537.045717
14 4.5 61111.0 68317.030645
15 4.9 67938.0 72097.015574
16 5.1 66029.0 73987.008038
17 5.3 83088.0 75877.000502
18 5.9 81363.0 81546.977895
19 6.0 93940.0 82491.974127
20 6.8 91738.0 90051.943985
21 7.1 98273.0 92886.932681
22 7.9 101302.0 100446.902538
23 8.2 113812.0 103281.891235
24 8.7 109431.0 108006.872395
25 9.0 105582.0 110841.861092
26 9.5 116969.0 115566.842252
27 9.6 112635.0 116511.838485
28 10.3 122391.0 123126.812110
29 10.5 121872.0 125016.804574
DISHANT KUMAR YADAV 2021BCS0136
# Step 1: Import the required python packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Step 2: Read the salary CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/sample_data/Salary_Data.csv')
# Step 3: Data analysis - scatter plot of salary against years of experience
# to show how the data is distributed.
exp, sal = df['YearsExperience'], df['Salary']
plt.scatter(x=exp, y=sal)
plt.xlabel(xlabel='Experience')
plt.ylabel(ylabel='Salary')
plt.title(label='Distribution of Experience vs. Salary')
plt.show()
output
# Step 4: Separate the independent variable from the dependent variable.
# X is selected with a column list so it stays a DataFrame; y is a Series.
X, y = df[['YearsExperience']], df['Salary']
# Step 5: Hold out 20% of the rows as a test set; random_state is fixed so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Step 6: Fit the linear regression model on the training split only.
regression_model = LinearRegression()
regression_model.fit(X=X_train, y=y_train)
▾ LinearRegression
LinearRegression()
# Step 7: Plot the training results - training points in blue, with the
# model's predictions for those same points drawn as a red line.
train_fit = regression_model.predict(X_train)
plt.scatter(x=X_train, y=y_train, color='blue')
plt.plot(X_train, train_fit, color='red')
plt.xlabel(xlabel='Experience')
plt.ylabel(ylabel='Salary')
plt.title(label='Training Results: Experience vs. Salary')
plt.show()
# Step 8: Plot the test results - held-out test points in blue.
plt.scatter(x=X_test, y=y_test, color='blue')
# The red line is drawn from the training inputs, i.e. it is the same line as
# in the training plot, shown here for comparison against the test points.
fitted_line = regression_model.predict(X_train)
plt.plot(X_train, fitted_line, color='red')
plt.xlabel(xlabel='Experience')
plt.ylabel(ylabel='Salary')
plt.title(label='Test Results: Experience vs. Salary')
plt.show()