Hello, there seems to be a bug in the PowerTransformer step of the preprocessing, which I run into on 15 out of 50 datasets when evaluating CARTE on datasets from OpenML and Kaggle. I understand that this is a known issue.
Below is a minimal reproducible example for one of the problematic datasets. The code is simplified as much as possible (e.g., no train/test split).
import openml
from huggingface_hub import hf_hub_download
from carte_ai import Table2GraphTransformer

# Download the fastText embeddings used by the CARTE preprocessor
model_path = hf_hub_download(repo_id="hi-paris/fastText", filename="cc.en.300.bin")
preprocessor = Table2GraphTransformer(fasttext_model_path=model_path)

# One of the problematic OpenML datasets
dataset = openml.datasets.get_dataset(46667)
x, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Fails inside the preprocessor's internal PowerTransformer fit
x = preprocessor.fit_transform(x, y=y)
In [4]: x = preprocessor.fit_transform(x, y=y)
/data/home/alan.arazi/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/preprocessing/_data.py:3438: RuntimeWarning: overflow encountered in power
out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
---------------------------------------------------------------------------
BracketError Traceback (most recent call last)
Cell In[4], line 1
----> 1 x = preprocessor.fit_transform(x, y=y)
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/utils/_set_output.py:316, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
314 @wraps(f)
315 def wrapped(self, X, *args, **kwargs):
--> 316 data_to_wrap = f(self, X, *args, **kwargs)
317 if isinstance(data_to_wrap, tuple):
318 # only wrap the first output for cross decomposition
319 return_tuple = (
320 _wrap_data_with_container(method, data_to_wrap[0], X, self),
321 *data_to_wrap[1:],
322 )
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/base.py:1101, in TransformerMixin.fit_transform(self, X, y, **fit_params)
1098 return self.fit(X, **fit_params).transform(X)
1099 else:
1100 # fit method of arity 2 (supervised transformation)
-> 1101 return self.fit(X, y, **fit_params).transform(X)
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/carte_ai/src/carte_table_to_graph.py:148, in Table2GraphTransformer.fit(self, X, y)
146 num_cols_exist = [col for col in self.num_col_names if col in X.columns]
147 if num_cols_exist:
--> 148 self.num_transformer_.fit(X[num_cols_exist])
149 #print(f"Numerical columns fitted for normalization: {num_cols_exist}")
151 self.is_fitted_ = True
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1466 estimator._validate_params()
1468 with config_context(
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/preprocessing/_data.py:3251, in PowerTransformer.fit(self, X, y)
3231 @_fit_context(prefer_skip_nested_validation=True)
3232 def fit(self, X, y=None):
3233 """Estimate the optimal parameter lambda for each feature.
3234
3235 The optimal lambda parameter for minimizing skewness is estimated on
(...)
3249 Fitted transformer.
3250 """
-> 3251 self._fit(X, y=y, force_transform=False)
3252 return self
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/preprocessing/_data.py:3304, in PowerTransformer._fit(self, X, y, force_transform)
3301 self.lambdas_[i] = 1.0
3302 continue
-> 3304 self.lambdas_[i] = optim_function(col)
3306 if self.standardize or force_transform:
3307 X[:, i] = transform_function(X[:, i], self.lambdas_[i])
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/preprocessing/_data.py:3493, in PowerTransformer._yeo_johnson_optimize(self, x)
3491 x = x[~np.isnan(x)]
3492 # choosing bracket -2, 2 like for boxcox
-> 3493 return optimize.brent(_neg_log_likelihood, brack=(-2, 2))
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/scipy/optimize/_optimize.py:2655, in brent(func, args, brack, tol, full_output, maxiter)
2583 """
2584 Given a function of one variable and a possible bracket, return
2585 a local minimizer of the function isolated to a fractional precision
(...)
2651
2652 """
2653 options = {'xtol': tol,
2654 'maxiter': maxiter}
-> 2655 res = _minimize_scalar_brent(func, brack, args, **options)
2656 if full_output:
2657 return res['x'], res['fun'], res['nit'], res['nfev']
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/scipy/optimize/_optimize.py:2697, in _minimize_scalar_brent(func, brack, args, xtol, maxiter, disp, **unknown_options)
2694 brent = Brent(func=func, args=args, tol=tol,
2695 full_output=True, maxiter=maxiter, disp=disp)
2696 brent.set_bracket(brack)
-> 2697 brent.optimize()
2698 x, fval, nit, nfev = brent.get_result(full_output=True)
2700 success = nit < maxiter and not (np.isnan(x) or np.isnan(fval))
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/scipy/optimize/_optimize.py:2462, in Brent.optimize(self)
2459 def optimize(self):
2460 # set up for optimization
2461 func = self.func
-> 2462 xa, xb, xc, fa, fb, fc, funcalls = self.get_bracket_info()
2463 _mintol = self._mintol
2464 _cg = self._cg
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/scipy/optimize/_optimize.py:2431, in Brent.get_bracket_info(self)
2429 xa, xb, xc, fa, fb, fc, funcalls = bracket(func, args=args)
2430 elif len(brack) == 2:
-> 2431 xa, xb, xc, fa, fb, fc, funcalls = bracket(func, xa=brack[0],
2432 xb=brack[1], args=args)
2433 elif len(brack) == 3:
2434 xa, xb, xc = brack
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/scipy/optimize/_optimize.py:3070, in bracket(func, xa, xb, args, grow_limit, maxiter)
3068 e = BracketError(msg)
3069 e.data = (xa, xb, xc, fa, fb, fc, funcalls)
-> 3070 raise e
3072 return xa, xb, xc, fa, fb, fc, funcalls
BracketError: The algorithm terminated without finding a valid bracket. Consider trying different initial points.
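For context, the overflow warning above suggests that the Yeo-Johnson lambda search inside PowerTransformer produces values large enough to overflow, which then makes scipy's brent bracketing fail. A possible workaround (untested on all of the failing datasets, and purely a sketch on my side, not what Table2GraphTransformer does internally) is to compress the numeric columns before calling the preprocessor so the lambda search never sees extreme magnitudes:

import numpy as np

# Continues from the reproducer above: x, y and preprocessor are already defined.
# Hypothetical pre-scaling step: a signed log1p compresses very large magnitudes
# so the internal Yeo-Johnson optimization is less likely to overflow.
num_cols = x.select_dtypes(include="number").columns
x[num_cols] = np.sign(x[num_cols]) * np.log1p(np.abs(x[num_cols]))

x = preprocessor.fit_transform(x, y=y)

Alternatively, replacing the internal PowerTransformer with QuantileTransformer(output_distribution="normal") would sidestep the lambda optimization entirely, but that would be a change on the CARTE side.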