Logistic Regression¶
In this execise, I'd like you to perform a logistic regression on collecge applicant data.
This time, no explicit instructions, but here's the documentation for a Logistic Regression from SKLearn: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
To simplify the dataset for this problem, I converted the "Chance of Admit"
into a boolean "Admitted"
column.
I've also done the train_test_split for you using sklearn's helpful helper function.
In [7]:
Copied!
# 1. Load and clean the data
# I did this part for you!
from google.colab import drive
import pandas as pd
drive.mount('/content/gdrive')
df = pd.read_csv('/content/gdrive/My Drive/datasets/admission_predict.csv')
df["Admitted"] = df["Chance of Admit "] > 0.75
df.drop("Chance of Admit ", axis=1, inplace=True)
df.rename(columns={"LOR ": "LOR"}, inplace=True)
df
# 1. Load and clean the data
# I did this part for you!
from google.colab import drive
import pandas as pd
drive.mount('/content/gdrive')
df = pd.read_csv('/content/gdrive/My Drive/datasets/admission_predict.csv')
df["Admitted"] = df["Chance of Admit "] > 0.75
df.drop("Chance of Admit ", axis=1, inplace=True)
df.rename(columns={"LOR ": "LOR"}, inplace=True)
df
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Out[7]:
Serial No. | GRE Score | TOEFL Score | University Rating | SOP | LOR | CGPA | Research | Admitted | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 337 | 118 | 4 | 4.5 | 4.5 | 9.65 | 1 | True |
1 | 2 | 324 | 107 | 4 | 4.0 | 4.5 | 8.87 | 1 | True |
2 | 3 | 316 | 104 | 3 | 3.0 | 3.5 | 8.00 | 1 | False |
3 | 4 | 322 | 110 | 3 | 3.5 | 2.5 | 8.67 | 1 | True |
4 | 5 | 314 | 103 | 2 | 2.0 | 3.0 | 8.21 | 0 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
395 | 396 | 324 | 110 | 3 | 3.5 | 3.5 | 9.04 | 1 | True |
396 | 397 | 325 | 107 | 3 | 3.0 | 3.5 | 9.11 | 1 | True |
397 | 398 | 330 | 116 | 4 | 5.0 | 4.5 | 9.45 | 1 | True |
398 | 399 | 312 | 103 | 3 | 3.5 | 4.0 | 8.78 | 0 | False |
399 | 400 | 333 | 117 | 4 | 5.0 | 4.0 | 9.66 | 1 | True |
400 rows × 9 columns
In [24]:
Copied!
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# 2. Train/Test Split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
feature_cols = ["GRE Score", "TOEFL Score", "University Rating", "SOP", "LOR", "CGPA", "Research"]
# 3. Create X_train, y_train, X_test, y_test variables
X_train = df_train[feature_cols]
X_test = df_test[feature_cols]
y_train = df_train["Admitted"]
y_test = df_test["Admitted"]
# 4. Train the Model
model = LogisticRegression()
model.fit(X_train, y_train)
# 5. Evaluate the Model (Report Accuracy as correct/all)
y_pred = model.predict(X_test)
df_test["Predicted Admitted"] = y_pred
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# 2. Train/Test Split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
feature_cols = ["GRE Score", "TOEFL Score", "University Rating", "SOP", "LOR", "CGPA", "Research"]
# 3. Create X_train, y_train, X_test, y_test variables
X_train = df_train[feature_cols]
X_test = df_test[feature_cols]
y_train = df_train["Admitted"]
y_test = df_test["Admitted"]
# 4. Train the Model
model = LogisticRegression()
model.fit(X_train, y_train)
# 5. Evaluate the Model (Report Accuracy as correct/all)
y_pred = model.predict(X_test)
df_test["Predicted Admitted"] = y_pred
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
In [25]:
Copied!
correct_pred = len(df_test[df_test["Admitted"] == df_test["Predicted Admitted"]])
total_pred = len(df_test)
accuracy = correct_pred / total_pred
true_positives = len(df_test[(df_test["Admitted"] == True) & (df_test["Predicted Admitted"] == True)])
print(f"True Positives {true_positives}")
true_negatives = len(df_test[(df_test["Admitted"] == False) & (df_test["Predicted Admitted"] == False)])
print(f"True Negatives {true_negatives}")
false_positives = len(df_test[(df_test["Admitted"] == False) & (df_test["Predicted Admitted"] == True)])
print(f"False Positives {false_positives}")
false_negatives = len(df_test[(df_test["Admitted"] == True) & (df_test["Predicted Admitted"] == False)])
print(f"False Negatives {false_negatives}")
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
print(f"Accuracy is {accuracy * 100}%")
print(f"Precision: {precision * 100}%")
print(f"Recall: {recall * 100}%")
correct_pred = len(df_test[df_test["Admitted"] == df_test["Predicted Admitted"]])
total_pred = len(df_test)
accuracy = correct_pred / total_pred
true_positives = len(df_test[(df_test["Admitted"] == True) & (df_test["Predicted Admitted"] == True)])
print(f"True Positives {true_positives}")
true_negatives = len(df_test[(df_test["Admitted"] == False) & (df_test["Predicted Admitted"] == False)])
print(f"True Negatives {true_negatives}")
false_positives = len(df_test[(df_test["Admitted"] == False) & (df_test["Predicted Admitted"] == True)])
print(f"False Positives {false_positives}")
false_negatives = len(df_test[(df_test["Admitted"] == True) & (df_test["Predicted Admitted"] == False)])
print(f"False Negatives {false_negatives}")
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
print(f"Accuracy is {accuracy * 100}%")
print(f"Precision: {precision * 100}%")
print(f"Recall: {recall * 100}%")
True Positives 30 True Negatives 40 False Positives 8 False Negatives 2 Accuracy is 87.5% Precision: 78.94736842105263% Recall: 93.75%
In [ ]:
Copied!