[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Mar 29, 2026
commit 0776097bc5481227a79a9d06f4c7458d5149c147
32 changes: 16 additions & 16 deletions machine_learning/decision_tree.py
@@ -4,8 +4,8 @@
Output: The decision tree maps a real number input to a real number output.
"""

import numpy as np
from collections import Counter

Check failure on line 8 (GitHub Actions / ruff): machine_learning/decision_tree.py:7:1: I001 Import block is un-sorted or un-formatted. help: Organize imports

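For reference, an import block that satisfies I001 (stdlib before third-party, in isort order) would presumably be:

from collections import Counter

import numpy as np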

class DecisionTree:
@@ -18,7 +18,7 @@
self.prediction = None
self.task = task
self.criterion = criterion

def mean_squared_error(self, labels, prediction):
"""
mean_squared_error:
@@ -51,23 +51,23 @@
would be incorrectly classified.
Formula: Gini = 1 - sum(p_i^2)
where p_i is the probability of class i.

Lower Gini value indicates better purity (best split).
"""
classes, counts = np.unique(y, return_counts=True)

Check failure on line 57 (GitHub Actions / ruff): machine_learning/decision_tree.py:57:9: RUF059 Unpacked variable `classes` is never used. help: Prefix it with an underscore or any other dummy variable pattern
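The RUF059 fix is just a rename of the unused unpacked variable, e.g.:

_classes, counts = np.unique(y, return_counts=True)

The same rename applies to the identical failure in entropy below.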
prob = counts / counts.sum()
- return 1 - np.sum(prob ** 2)
+ return 1 - np.sum(prob**2)

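As a sanity check on the Gini formula documented above: for balanced labels [0, 0, 1, 1] each class has probability 0.5, so Gini = 1 - (0.25 + 0.25) = 0.5. A minimal stand-alone sketch (illustrative, not part of this diff):

# Illustrative check of the Gini computation above (not part of the diff)
import numpy as np

y = np.array([0, 0, 1, 1])
_classes, counts = np.unique(y, return_counts=True)
prob = counts / counts.sum()  # array([0.5, 0.5])
print(1 - np.sum(prob**2))  # 0.5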
def entropy(self, y):
"""
Computes the entropy (impurity) of a set of labels.
Entropy measures the randomness or disorder in the data.
Formula: Entropy = - sum(p_i * log2(p_i))
where p_i is the probability of class i.

Lower entropy means higher purity.
"""
classes, counts = np.unique(y, return_counts=True)

Check failure on line 70 (GitHub Actions / ruff): machine_learning/decision_tree.py:70:9: RUF059 Unpacked variable `classes` is never used. help: Prefix it with an underscore or any other dummy variable pattern
prob = counts / counts.sum()
return -np.sum(prob * np.log2(prob + 1e-9))

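The same kind of check works for entropy: a 50/50 split gives -(0.5 * log2(0.5) + 0.5 * log2(0.5)) = 1.0 bit, and the 1e-9 term only guards against log2(0) when a probability is zero. An illustrative sketch, not part of this diff:

# Illustrative check of the entropy computation above (not part of the diff)
import numpy as np

y = np.array([0, 0, 1, 1])
_classes, counts = np.unique(y, return_counts=True)
prob = counts / counts.sum()
print(-np.sum(prob * np.log2(prob + 1e-9)))  # ~= 1.0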
@@ -76,8 +76,8 @@
Computes the information gain from splitting a dataset.
Information gain represents the reduction in impurity
after a dataset is split into left and right subsets.
Formula: IG = Impurity(parent) - [weighted impurity(left) + weighted impurity(right)]

Check failure on line 79 (GitHub Actions / ruff): machine_learning/decision_tree.py:79:89: E501 Line too long (93 > 88)

Higher information gain indicates a better split.
"""
if self.criterion == "gini":
@@ -90,9 +90,7 @@
weight_l = len(left) / len(parent)
weight_r = len(right) / len(parent)

- return func(parent) - (
-     weight_l * func(left) + weight_r * func(right)
- )
+ return func(parent) - (weight_l * func(left) + weight_r * func(right))

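The weighted-impurity formula can be verified by hand on a perfect split: parent [0, 0, 1, 1] has Gini 0.5 and both children are pure, so IG = 0.5 - (0.5 * 0 + 0.5 * 0) = 0.5. A stand-alone sketch mirroring the method above (the gini helper is a local stand-in, not part of the diff):

# Stand-alone sketch of the information-gain computation (illustrative)
import numpy as np


def gini(y):
    _classes, counts = np.unique(y, return_counts=True)
    prob = counts / counts.sum()
    return 1 - np.sum(prob**2)


parent = np.array([0, 0, 1, 1])
left, right = parent[:2], parent[2:]  # a perfect split
weight_l = len(left) / len(parent)
weight_r = len(right) / len(parent)
print(gini(parent) - (weight_l * gini(left) + weight_r * gini(right)))  # 0.5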
def most_common_label(self, y):
return Counter(y).most_common(1)[0][0]
@@ -150,17 +148,17 @@
return

best_split = 0

"""
loop over all possible splits for the decision tree. find the best split.
if no split exists that is less than 2 * error for the entire array
then the data set is not split and the average for the entire array is used as
the predictor
"""
if self.task == "regression":
best_score = float("inf")
else:
best_score = -float("inf")

Check failure on line 161 (GitHub Actions / ruff): machine_learning/decision_tree.py:158:9: SIM108 Use ternary operator `best_score = float("inf") if self.task == "regression" else -float("inf")` instead of `if`-`else`-block. help: Replace `if`-`else`-block with `best_score = float("inf") if self.task == "regression" else -float("inf")`

for i in range(len(x)):
if len(x[:i]) < self.min_leaf_size:
@@ -180,7 +178,7 @@
best_score = score
best_split = i

- else:
+ else:
gain = self.information_gain(y, left_y, right_y)

if gain > best_score:
@@ -211,8 +209,8 @@
self.left.train(left_x, left_y)
self.right.train(right_x, right_y)

else:
if self.task == "regression":

Check failure on line 213 (GitHub Actions / ruff): machine_learning/decision_tree.py:212:9: PLR5501 Use `elif` instead of `else` then `if`, to reduce indentation. help: Convert to `elif`
self.prediction = np.mean(y)
else:
self.prediction = self.most_common_label(y)
@@ -234,7 +232,7 @@

raise ValueError("Decision tree not yet trained")


class TestDecisionTree:
"""Decision Tres test class"""

@@ -252,7 +250,7 @@

return float(squared_error_sum / labels.size)


def main():
"""
In this demonstration we're generating a sample data set from the sin function in
@@ -270,15 +268,17 @@
x_cls = np.array([1, 2, 3, 4, 5, 6])
y_cls = np.array([0, 0, 0, 1, 1, 1])

- clf = DecisionTree(depth=3, min_leaf_size=1, task="classification", criterion="gini")
+ clf = DecisionTree(
+     depth=3, min_leaf_size=1, task="classification", criterion="gini"
+ )
clf.train(x_cls, y_cls)

print("Classification prediction (2):", clf.predict(2))
print("Classification prediction (5):", clf.predict(5))
print("Classification prediction (2):", clf.predict(2))
print("Classification prediction (5):", clf.predict(5))


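For comparison with the classification demo, a regression run of the same class might look like the sketch below; the arange/sin data and the depth/min_leaf_size values are assumptions standing in for the sin-based demo the docstring mentions (those lines are collapsed in this view):

# Hypothetical regression usage; parameter values are illustrative assumptions.
x_reg = np.arange(-1.0, 1.0, 0.005)
y_reg = np.sin(x_reg)

reg = DecisionTree(depth=10, min_leaf_size=10, task="regression", criterion="gini")
reg.train(x_reg, y_reg)
print("Regression prediction (0.5):", reg.predict(0.5))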
if __name__ == "__main__":
main()
import doctest

- doctest.testmod(name="mean_squared_error", verbose=True)
+ doctest.testmod(name="mean_squared_error", verbose=True)