DATA ANALYSIS WITH PYTHON
PYTHON DATABASE API
CONNECTION OBJECTS
_DATABASE CONNECT
_MANAGE TRANSACTION
CURSOR OBJECT
_DATABASE QUERIES
CONNECTION METHODS
_CURSOR()
_COMMIT()
_ROLLBACK()
_CLOSE()
WRITING CODE USING DB-API
>from dbmodule import connect
>connection = connect('databasename', 'username', 'pswd')
>cursor = connection.cursor()
>cursor.execute('select * from mytable')
>results = cursor.fetchall()
>cursor.close()
>connection.close()
DATA WRANGLING
MISSING VALUES
_df.dropna(subset=["price"], axis = 0, inplace =True)
_df.replace(missing_value, new_value)
_mean = df["normalized-loss"].mean()
_df["normalized-loss"].replace(np.nan, mean)
DATA FORMATTING
_df.astype()
DATA NORMALIZATION
METHODS OF NORMALIZING
_SIMPLE FEATURE SCALING(xnew = xold/xmax)
_MIN-MAX(xnew=(xold-xmin)/(xmax-xmin))
_Z-SCORE(xnew=(xold-mu)/sigma)
BINNING
_GROUPING OF VALUES INTO BINS
_CONVERTING NUMERIC INTO CATEGORICAL VARIABLES
_bins = np.linspace(min(df["price"]), max(df["price"]), 4)
_group_names = ["Low", "Medium", "High"]
_df["price-binned"] = pd.cut(df["price"], bins, labels= group_names,
include_lowest=True)
HOW TO TURN CATEGORICAL VARIABLES INTO QUANTITATIVE VARIABLES:
DUMMY VARIABLES
_pd.get_dummies(df['fuel'])
EXPLORATORY DATA ANALYSIS
DESCRIPTIVE STATISTICS
_value_counts()
_BOX PLOTS
_sns.boxplot(x= "drive-wheels", y="price", data=df)
_SCATTER PLOT
>y= df["price"]
>x= df["engine size"]
>plt.scatter(x,y)
>plt.title("...")
>plt.xlabel("Engine Size")
>plt.ylabel("Price")
GROUP BY
PIVOT
HEATMAP
>plt.pcolor(df_pivot, cmap='RdBu')
>plt.colorbar()
>plt.show()
CORRELATION
PEARSON CORRELATION
_CORRELATION COEFFICIENT(CLOSE TO +1:LARGE POSITIVE RELATIONSHIP, CLOSE TO -1:LARGE
NEGATIVE RELATIONSHIP, CLOSE TO 0:NO RELATIONSHIP)
_P-VALUE(<0.001 Strong Certainty, <0.05 Moderate Certainty, <0.1 Weak Certainty,
>0.1 No Certainty)
STRONG CORRELATION
_CORRELATION COEFFICIENT CLOSE TO +1 OR -1
_P-VALUE LESS THAN 0.001
>pearson_coef, p_value = stats.pearsonr(df['horsepower'], df['price'])
CORRELATION HEATMAP
CHI-SQUARE TEST
>data = [[20, 30], # Male: [Like, Dislike]
[25, 25]] # Female: [Like, Dislike]
>df = pd.DataFrame(data, columns=["Like", "Dislike"], index=["Male", "Female"])
>chi2, p, dof, expected = chi2_contingency(df)
MODEL DEVELOPMENT
LINEAR REGRESSION(ONE INDEPENDENT VARIABLE)
_y=b0+b1x
_FIT
_PREDICT
>from sklearn.linear_model import LinearRegression
>lm=LinearRegression()
>X=df[['highway-mpg']]
>Y=df[['price']]
>lm.fit(X, Y)
>Yhat=lm.predict(X)
MULTIPLE LINEAR REGRESSION
_ONE CONTINUOUS TARGET(Y) VARIABLE
_TWO OR MORE PREDICTOR(X) VARIABLE
_Y^=b0+b1x1+b2x2+b3x3+b4x4
_Y^=1 + 2x1 + 3x2
>Z=df[['']]
>lm.fit(Z, df[''])
>Yhat=lm.predict(Z)
MODEL EVALUATION USING VISUALIZATION
>import seaborn as sns
>sns.regplot(x="highway-mpg", y="price", data=df)
>plt.ylim(0,)
RESIDUAL PLOT
>import seaborn as sns
>sns.residplot(x=df['highway-mpg'], y=df['price'])
DISTRIBUTION PLOT
>import seaborn as sns
>ax1= sns.distplot(df['price'], hist=False, color="r", label="Actual Value")
>sns.distplot(Yhat, hist=False, color="b", label="Fitted Values", ax=ax1)
POLYNOMIAL REGRESSION
_USEFUL FOR CURVILINEAR RELATIONSHIPS(BY SQUARING OR SETTING HIGHER-ORDER TERMS OF
PREDICTOR VARIABLES)
QUADRATIC-2ND ORDER
_Y^=b0+b1x1+b2(x1)^2
CUBIC-3RD ORDER
_Y^=b0+b1x1+b2(x1)^2+b3(x1)^3
CALCULATE POLYNOMIAL OF 3RD ORDER
>f=np.polyfit(x,y,3)
>p=np.poly1d(f)
>print(p)
MORE THAN ONE DIMENSIONS
>from sklearn.preprocessing import PolynomialFeatures
>pr=PolynomialFeatures(degree=2, include_bias=False)
>x_polly=pr.fit_transform(x[['horsepower', 'curb-weight']])
>from sklearn.preprocessing import StandardScaler
>SCALE=StandardScaler()
>SCALE.fit(x_data[['horsepower', 'curb-weight']])
>x_scale=SCALE.transform(x_data[['horsepower', 'curb-weight']])
PIPELINES
>from sklearn.preprocessing import PolynomialFeatures
>from sklearn.linear_model import LinearRegression
>from sklearn.preprocessing import StandardScaler
>from sklearn.pipeline import Pipeline
>Input=[('polynomial',PolynomialFeatures(degree=2)),('scale',StandardScaler()),
('Model',LinearRegression())]
>pipe=Pipeline(Input)
>pipe.fit(df[['']], y)
>yhat=pipe.predict(X[['']])
MEASURES FOR IN SAMPLE EVALUATION
TWO MEASURES
MEAN SQUARED ERROR
>from sklearn.metrics import mean_squared_error
>mean_squared_error(df['price'],Y_predict_simple_fit)
R-SQUARED
>X=df[['highway-mpg']]
>Y=df[['price']]
>lm.fit(X, Y)
>lm.score(X,Y)
PREDICTION AND DECISION MAKING
>import numpy as np
>new_input=np.arange(1,101,1).reshape(-1,1)
>yhat=lm.predict(new_input)
>VISUALIZATION
MODEL EVALUATION
SPLIT DATA INTO TEST AND TRAIN
GENERALIZATION PERFORMANCE
CROSS VALIDATION
_cross_val_score()
>from sklearn.model_selection import cross_val_score
>scores= cross_val_score(lr, x_data, y_data, cv=3)
>np.mean(scores)
_cross_val_predict()
>from sklearn.model_selection import cross_val_predict
>yhat=cross_val_predict(lr2e, x_data, y_data, cv=3)
OVERFITTING, UNDERFITTING AND MODEL SELECTION
RIDGE REGRESSION
GRID SEARCH