-
Notifications
You must be signed in to change notification settings - Fork 11.5k
Expand file tree
/
Copy pathlasso_model.py
More file actions
139 lines (111 loc) · 3.83 KB
/
lasso_model.py
File metadata and controls
139 lines (111 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import numpy as np
import polars as pl
from sklearn.linear_model import Lasso # type: ignore
from vnpy.alpha import (
AlphaDataset,
AlphaModel,
Segment,
logger
)
class LassoModel(AlphaModel):
    """LASSO regression learning algorithm"""

    def __init__(
        self,
        alpha: float = 0.0005,
        max_iter: int = 1000,
        random_state: int | None = None,
    ) -> None:
        """
        Parameters
        ----------
        alpha : float
            Regularization parameter (L1 penalty strength)
        max_iter : int
            Maximum number of iterations
        random_state : int | None
            Random seed
        """
        self.alpha: float = alpha
        self.max_iter: int = max_iter
        self.random_state: int | None = random_state

        # None until fit() is called; annotation reflects that
        self.model: Lasso | None = None
        self.feature_names: list[str] = []

    def fit(self, dataset: AlphaDataset) -> None:
        """
        Fit the model with dataset

        Trains on the union of the TRAIN and VALID segments,
        de-duplicated on (datetime, vt_symbol) and sorted.

        Parameters
        ----------
        dataset : AlphaDataset
            The dataset used for training
        """
        # Get training data
        df_train: pl.DataFrame = dataset.fetch_learn(Segment.TRAIN)
        df_valid: pl.DataFrame = dataset.fetch_learn(Segment.VALID)

        # Merge data, remove duplicates and sort
        df_train = pl.concat([df_train, df_valid])
        df_train = df_train.unique(subset=["datetime", "vt_symbol"])
        df_train = df_train.sort(["datetime", "vt_symbol"])

        # Extract feature names: all columns between the two index
        # columns (datetime, vt_symbol) and the trailing label column
        self.feature_names = df_train.columns[2:-1]

        # Convert to numpy arrays
        X: np.ndarray = df_train.select(self.feature_names).to_numpy()
        y: np.ndarray = np.array(df_train["label"])

        # Create and train the model
        self.model = Lasso(
            alpha=self.alpha,
            max_iter=self.max_iter,
            random_state=self.random_state,
            fit_intercept=False,
            copy_X=False
        )
        self.model.fit(X, y)

    def predict(self, dataset: AlphaDataset, segment: Segment) -> np.ndarray:
        """
        Make predictions using the model

        Parameters
        ----------
        dataset : AlphaDataset
            The dataset used for prediction
        segment : Segment
            The segment of data to use for prediction

        Returns
        -------
        np.ndarray
            Prediction results

        Raises
        ------
        ValueError
            If the model has not been fitted yet
        """
        # Check if model exists
        if self.model is None:
            raise ValueError("model is not fitted yet!")

        # Get data for prediction
        df: pl.DataFrame = dataset.fetch_infer(segment)
        df = df.sort(["datetime", "vt_symbol"])

        # Convert to numpy array (same positional column layout as fit:
        # features are everything between the index pair and the label)
        data: np.ndarray = df.select(df.columns[2: -1]).to_numpy()

        # Return prediction results
        result: np.ndarray = self.model.predict(data)
        return result

    def detail(self) -> None:
        """
        Output detailed information about the model

        Displays feature importance based on the coefficients
        of the LASSO model, showing only non-zero features
        sorted by absolute value.

        Raises
        ------
        ValueError
            If the model has not been fitted yet
        """
        # Guard against use before fit(), consistent with predict()
        # (previously raised an opaque AttributeError on None)
        if self.model is None:
            raise ValueError("model is not fitted yet!")

        # Get feature coefficients
        coef: np.ndarray = self.model.coef_

        # Extract feature coefficients
        data: list[tuple[str, float]] = list(zip(self.feature_names, coef, strict=False))

        # Filter non-zero features
        data = [x for x in data if x[1]]

        # Sort by absolute value
        data.sort(key=lambda x: abs(x[1]), reverse=True)

        # Filter out features with very small coefficients
        data = [x for x in data if round(x[1], 6) != 0]

        # Print feature importance
        logger.info(f"LASSO模型特征总数量: {len(data)}")
        for name, importance in data:
            logger.info(f"{name}: {importance:.6f}")