feat: allow list-like complexity_of_variables
MilesCranmer committed Jun 16, 2024
1 parent a548719 commit 3a3b168
Showing 1 changed file with 98 additions and 12 deletions.
110 changes: 98 additions & 12 deletions pysr/sr.py
@@ -137,6 +137,7 @@ def _check_assertions(
X,
use_custom_variable_names,
variable_names,
+    complexity_of_variables,
weights,
y,
X_units,
@@ -161,6 +162,13 @@
"and underscores are allowed."
)
assert_valid_sympy_symbol(var_name)
+    if (
+        isinstance(complexity_of_variables, list)
+        and len(complexity_of_variables) != X.shape[1]
+    ):
+        raise ValueError(
+            "The number of elements in `complexity_of_variables` must equal the number of features in `X`."
+        )
if X_units is not None and len(X_units) != X.shape[1]:
raise ValueError(
"The number of units in `X_units` must equal the number of features in `X`."
@@ -331,7 +339,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
`idx` argument to the function, which is `nothing`
for non-batched, and a 1D array of indices for batched.
Default is `None`.
-    complexity_of_operators : dict[str, float]
+    complexity_of_operators : dict[str, Union[int, float]]
If you would like to use a complexity other than 1 for an
operator, specify the complexity here. For example,
`{"sin": 2, "+": 1}` would give a complexity of 2 for each use
@@ -340,10 +348,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
numbers for a complexity, and the total complexity of a tree
will be rounded to the nearest integer after computing.
Default is `None`.
-    complexity_of_constants : float
+    complexity_of_constants : int | float
Complexity of constants. Default is `1`.
-    complexity_of_variables : float
-        Complexity of variables. Default is `1`.
+    complexity_of_variables : int | float
+        Global complexity of variables. To set different complexities for
+        different variables, pass a list of complexities to the `fit` method
+        with keyword `complexity_of_variables`. You cannot use both.
+        Default is `1`.
parsimony : float
Multiplicative factor for how much to punish complexity.
Default is `0.0032`.
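
A minimal usage sketch of the two styles this docstring describes (the data and complexity values below are made up for illustration; all other `PySRRegressor` options are assumed to be left at their defaults):

    import numpy as np
    from pysr import PySRRegressor

    X = np.random.randn(100, 2)
    y = X[:, 0] ** 2 + X[:, 1]

    # Global style: every variable costs 3 complexity units.
    model = PySRRegressor(complexity_of_variables=3)
    model.fit(X, y)

    # Per-variable style: make the second feature more "expensive",
    # so the search prefers equations built from the first feature.
    model = PySRRegressor()
    model.fit(X, y, complexity_of_variables=[1, 3])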
@@ -689,6 +700,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
n_features_in_: int
feature_names_in_: ArrayLike[str]
display_feature_names_in_: ArrayLike[str]
+    complexity_of_variables_: Union[int, float, List[Union[int, float]]]
X_units_: Union[ArrayLike[str], None]
y_units_: Union[str, ArrayLike[str], None]
nout_: int
@@ -720,7 +732,7 @@ def __init__(
loss_function: Optional[str] = None,
complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
complexity_of_constants: Union[int, float] = 1,
-        complexity_of_variables: Union[int, float] = 1,
+        complexity_of_variables: Optional[Union[int, float]] = None,
parsimony: float = 0.0032,
dimensional_constraint_penalty: Optional[float] = None,
dimensionless_constants_only: bool = False,
@@ -1338,13 +1350,22 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams:
return param_container

def _validate_and_set_fit_params(
-        self, X, y, Xresampled, weights, variable_names, X_units, y_units
+        self,
+        X,
+        y,
+        Xresampled,
+        weights,
+        variable_names,
+        complexity_of_variables,
+        X_units,
+        y_units,
) -> Tuple[
ndarray,
ndarray,
Optional[ndarray],
Optional[ndarray],
ArrayLike[str],
+        Union[int, float, List[Union[int, float]]],
Optional[ArrayLike[str]],
Optional[Union[str, ArrayLike[str]]],
]:
@@ -1369,6 +1390,8 @@ def _validate_and_set_fit_params(
for that particular element of y.
variable_names : ndarray of length n_features
Names of each variable in the training dataset, `X`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
X_units : list[str] of length n_features
Units of each variable in the training dataset, `X`.
y_units : str | list[str] of length n_out
@@ -1416,6 +1439,22 @@ def _validate_and_set_fit_params(
"Please use valid names instead."
)

+        if (
+            complexity_of_variables is not None
+            and self.complexity_of_variables is not None
+        ):
+            raise ValueError(
+                "You cannot set `complexity_of_variables` at both `fit` and `__init__`. "
+                "Pass it at `__init__` to set a global default, OR use `fit` to set it "
+                "for each variable individually."
+            )
+        elif complexity_of_variables is not None:
+            complexity_of_variables = complexity_of_variables
+        elif self.complexity_of_variables is not None:
+            complexity_of_variables = self.complexity_of_variables
+        else:
+            complexity_of_variables = 1
+
# Data validation and feature name fetching via sklearn
# This method sets the n_features_in_ attribute
if Xresampled is not None:
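
A short sketch of the precedence implemented above, continuing the earlier example (`X` and `y` as before; the error text is paraphrased in the comment):

    # The `fit` value wins if given, then the `__init__` value, then the default of 1.
    model = PySRRegressor(complexity_of_variables=2)
    try:
        model.fit(X, y, complexity_of_variables=[1, 2])
    except ValueError as err:
        # Cannot set `complexity_of_variables` at both `fit` and `__init__`.
        print(err)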
@@ -1446,10 +1485,20 @@ def _validate_and_set_fit_params(
else:
raise NotImplementedError("y shape not supported!")

+        self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
self.X_units_ = copy.deepcopy(X_units)
self.y_units_ = copy.deepcopy(y_units)

-        return X, y, Xresampled, weights, variable_names, X_units, y_units
+        return (
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
+        )

def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore
@@ -1465,6 +1514,7 @@ def _pre_transform_training_data(
y: ndarray,
Xresampled: Union[ndarray, None],
variable_names: ArrayLike[str],
+        complexity_of_variables: Union[int, float, List[Union[int, float]]],
X_units: Union[ArrayLike[str], None],
y_units: Union[ArrayLike[str], str, None],
random_state: np.random.RandomState,
@@ -1487,6 +1537,8 @@
variable_names : list[str]
Names of each variable in the training dataset, `X`.
Of length `n_features`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
X_units : list[str]
Units of each variable in the training dataset, `X`.
y_units : str | list[str]
@@ -1537,6 +1589,14 @@
],
)

+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = [
+                complexity_of_variables[i]
+                for i in range(len(complexity_of_variables))
+                if selection_mask[i]
+            ]
+            self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
+
if X_units is not None:
X_units = cast(
ArrayLike[str],
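
When feature selection prunes columns, the masking step above keeps a list-valued complexity aligned with the surviving features. An equivalent standalone sketch, with a made-up `selection_mask`:

    complexity_of_variables = [1, 2, 3]
    selection_mask = [True, False, True]  # e.g. produced by select_k_features
    complexity_of_variables = [
        c for c, keep in zip(complexity_of_variables, selection_mask) if keep
    ]
    assert complexity_of_variables == [1, 3]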
@@ -1561,7 +1621,7 @@
else:
X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)

-        return X, y, variable_names, X_units, y_units
+        return X, y, variable_names, complexity_of_variables, X_units, y_units

def _run(
self,
@@ -1618,6 +1678,7 @@ def _run(

nested_constraints = self.nested_constraints
complexity_of_operators = self.complexity_of_operators
+        complexity_of_variables = self.complexity_of_variables_
cluster_manager = self.cluster_manager

# Start julia backend processes
@@ -1662,6 +1723,9 @@
complexity_of_operators = jl.seval(complexity_of_operators_str)
# TODO: Refactor this into helper function

+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = jl_array(complexity_of_variables)
+
custom_loss = jl.seval(
str(self.elementwise_loss)
if self.elementwise_loss is not None
@@ -1720,7 +1784,7 @@
una_constraints=jl_array(una_constraints),
complexity_of_operators=complexity_of_operators,
complexity_of_constants=self.complexity_of_constants,
-            complexity_of_variables=self.complexity_of_variables,
+            complexity_of_variables=complexity_of_variables,
nested_constraints=nested_constraints,
elementwise_loss=custom_loss,
loss_function=custom_full_objective,
@@ -1865,6 +1929,9 @@ def fit(
Xresampled=None,
weights=None,
variable_names: Optional[ArrayLike[str]] = None,
+        complexity_of_variables: Optional[
+            Union[int, float, List[Union[int, float]]]
+        ] = None,
X_units: Optional[ArrayLike[str]] = None,
y_units: Optional[Union[str, ArrayLike[str]]] = None,
) -> "PySRRegressor":
@@ -1925,6 +1992,7 @@
self.selection_mask_ = None
self.julia_state_stream_ = None
self.julia_options_stream_ = None
+        self.complexity_of_variables_ = None
self.X_units_ = None
self.y_units_ = None

@@ -1938,10 +2006,18 @@
Xresampled,
weights,
variable_names,
+            complexity_of_variables,
X_units,
y_units,
) = self._validate_and_set_fit_params(
-            X, y, Xresampled, weights, variable_names, X_units, y_units
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
)

if X.shape[0] > 10000 and not self.batching:
@@ -1959,8 +2035,17 @@
seed = cast(int, random_state.randint(0, 2**31 - 1)) # For julia random

# Pre transformations (feature selection and denoising)
-        X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
-            X, y, Xresampled, variable_names, X_units, y_units, random_state
-        )
+        X, y, variable_names, complexity_of_variables, X_units, y_units = (
+            self._pre_transform_training_data(
+                X,
+                y,
+                Xresampled,
+                variable_names,
+                complexity_of_variables,
+                X_units,
+                y_units,
+                random_state,
+            )
+        )

# Warn about large feature counts (still warn if feature count is large
@@ -1987,6 +2072,7 @@
X,
use_custom_variable_names,
variable_names,
+            complexity_of_variables,
weights,
y,
X_units,
