added abin files #2

Open · wants to merge 1 commit into main
57 changes: 41 additions & 16 deletions ml-debugging/src/exercise.py
@@ -8,22 +8,41 @@
RANDOM_STATE = 42


-def pre_process_data(df: pd.DataFrame) -> pd.DataFrame:
+def pre_process_data(df: pd.DataFrame, columns_to_keep: List) -> pd.DataFrame:
"""
Preprocess the Titanic dataset before it can be used to train a model.
changed this method slightly to accommodate both the training and test set
"""
processed_df = df.copy()
processed_df = processed_df.dropna(subset=["Age", "Sex", "Pclass", "Embarked"])
processed_df["Sex"] = processed_df["Sex"].map({"female": 0, "male": 1})
processed_df["Embarked"] = processed_df["Embarked"].map({"S": 0, "C": 1, "Q": 2})

# encoding Pclass
processed_df['Pclass'] = processed_df['Pclass'].map({
'1': 1,
'1st': 1,
'2': 2,
'2nd': 2,
'3': 3,
'3rd': 3
})

processed_df = processed_df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

# ensuring the features that we need are retained
for feature in columns_to_keep:
if feature not in columns_to_keep:
processed_df[feature] = 0 # some default value that felt not too terrible

# ensuring the ordering of the features is retained or else it messes the ouptut
processed_df = processed_df[columns_to_keep]

return processed_df


 def calculate_metrics(expected: List[int], predicted: List[int]) -> Tuple[float, float, float]:
     """
     Calculate accuracy, precision and recall metrics of two datasets that contain binary data.
+    Corrected the formulas for accuracy, precision and recall.
     """
     true_positives, false_positives, true_negatives, false_negatives = 0, 0, 0, 0
 
@@ -39,9 +58,9 @@ def calculate_metrics(expected: List[int], predicted: List[int]) -> Tuple[float, float, float]:
         else:
             false_positives += 1
 
-    accuracy = true_positives / (true_positives + false_positives)
-    precision = true_positives / (true_positives + false_negatives)
-    recall = (true_positives + true_negatives) / (true_positives + false_positives + true_negatives + false_negatives)
+    accuracy = (true_positives + true_negatives) / (true_positives + false_positives + true_negatives + false_negatives)
+    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
+    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
 
     return accuracy, precision, recall

@@ -51,25 +70,27 @@ def main() -> None:

     # Load the Titanic dataset
     print("Loading the Titanic dataset...")
-    titanic_data = pd.read_csv("train.csv")
+    titanic_data = pd.read_csv(r"C:\Users\Abin\OneDrive\Desktop\ADSP\adsp_interview\live-exercises\ml-debugging\src\train.csv")
 
+    # Select features and target
+    target = "Survived"
+    features = ['Pclass', 'Sex', 'Embarked', 'Age', 'Fare']
+
     # Preprocess the data
     print("Preprocessing the data...")
-    titanic_data = pre_process_data(titanic_data)
+    titanic_data = pre_process_data(titanic_data, columns_to_keep=features + [target])
 
-    # Select features and target
-    target = "Survived"
-    X = titanic_data[titanic_data.columns]
+    X = titanic_data[features]
     y = titanic_data[target]
 
     # Split the data into training and testing sets
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
 
     # Train a Random Forest classifier
     print("Training a Random Forest classifier...")
-    model = RandomForestClassifier(n_estimators=2)
+    model = RandomForestClassifier(n_estimators=500)
 
-    model.fit(X_test, y_test)
+    model.fit(X_train, y_train)
 
     # Make predictions on the test set
     print("Making predictions on the test set...")
@@ -83,11 +104,15 @@ def main() -> None:
print("Recall:", recall)

# Make predictions on unseen data
print("Making predictions on unseen data...")
print("Making predictions on unseen data by Abin Varghese...")
# TODO You should make predictions here against the unseen data from the test.csv file
test_data = pd.read_csv(r'C:\Users\Abin\OneDrive\Desktop\ADSP\adsp_interview\live-exercises\ml-debugging\src\test.csv')
test_data = pre_process_data(test_data, columns_to_keep = features)

test_predictions = model.predict(test_data)

return
print(f"The predictions on the test data:\n{test_predictions}")


if __name__ == "__main__":
main()
main()
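
The corrected metrics now follow the standard confusion-matrix definitions: accuracy = (TP + TN) / (TP + FP + TN + FN), precision = TP / (TP + FP), recall = TP / (TP + FN). One way to sanity-check `calculate_metrics` is to compare it against scikit-learn's reference implementations; a minimal sketch, assuming it runs in the same module as exercise.py and that the counting loop is correct (the label vectors below are made up for illustration):

```python
# Sanity check: compare calculate_metrics against scikit-learn's reference
# implementations on a tiny hand-made label set (values are illustrative).
from sklearn.metrics import accuracy_score, precision_score, recall_score

expected = [1, 0, 1, 1, 0, 0, 1, 0]
predicted = [1, 0, 0, 1, 0, 1, 1, 0]

accuracy, precision, recall = calculate_metrics(expected, predicted)

# Here TP=3, TN=3, FP=1, FN=1, so all three metrics should equal 0.75
# and agree exactly with scikit-learn.
assert accuracy == accuracy_score(expected, predicted)
assert precision == precision_score(expected, predicted)
assert recall == recall_score(expected, predicted)
```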
8 changes: 8 additions & 0 deletions ml-debugging/src/notes.md
@@ -0,0 +1,8 @@
+# Notes while debugging
+
+1. Train on the training data instead of the testing data.
+2. Remove the target from X.
+3. n_estimators in Random Forest: could have written code to cross-validate, but chose a commonly accepted value (see the sketch after this list).
+4. Fixed the mismatch of columns between the training and testing sets.
+5. Encoded Pclass.
+6. Corrected the formulas for accuracy, precision and recall.
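
Regarding note 3, a minimal sketch of how `n_estimators` could be cross-validated with scikit-learn's `GridSearchCV`, assuming it runs inside `main()` where `X_train`, `y_train`, and `RANDOM_STATE` are in scope; the candidate grid and 5-fold setup are assumptions, not part of the exercise:

```python
# Hypothetical cross-validation for n_estimators; the candidate values
# and 5-fold setup are assumptions, not part of the original exercise.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators": [50, 100, 200, 500]}
search = GridSearchCV(
    RandomForestClassifier(random_state=RANDOM_STATE),
    param_grid,
    cv=5,                 # 5-fold cross-validation on the training split
    scoring="accuracy",   # same metric the exercise reports
)
search.fit(X_train, y_train)
print("Best n_estimators:", search.best_params_["n_estimators"])
```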