Model fits perfectly but GridSearchCV raises an error
While working on a project I have come across a weird error: fitting my model directly works perfectly, but applying a grid search to the same pipeline raises a ValueError.
The code below creates all the necessary objects and uses them in the pipeline.
import time

import numpy as np
import pandas as pd
from scipy.fft import fft
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


class DataPreprocess(BaseEstimator, TransformerMixin):
    """Merge features with labels and index the result by unix timestamp."""
    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.X_m = X.merge(y, on=['year', 'weekofyear'])
        return self

    def transform(self, X):
        # Operates on self.X_m (stored during fit), not on the X argument.
        dt = pd.to_datetime(self.X_m["week_start_date"], format="%Y-%m-%d")
        unix = []
        for i in dt:
            unix.append(time.mktime(i.timetuple()))
        X_t = self.X_m.reset_index().assign(date=unix).set_index(['date'])
        return X_t


class FourierComponents(BaseEstimator, TransformerMixin):
    """Create features based on sin(2*pi*f*t) and cos(2*pi*f*t)."""
    def __init__(self, n=10):
        self.n = n

    def fit(self, X, y=None):
        # 'total_cases' is available here because DataPreprocess merged
        # the labels into the feature frame.
        self.labels = X['total_cases']
        self.Y_t = fft(self.labels - self.labels.mean())
        self.Y_t = self.Y_t[:len(self.labels) // 2]
        self.ind_max = np.abs(self.Y_t).argsort()
        self.t_span = len(self.labels)
        self.f = np.linspace(0, len(self.Y_t), len(self.Y_t)) / self.t_span
        self.f_ind = self.f[self.ind_max]
        self.ind = pd.RangeIndex(
            start=1, stop=len(X.index.get_level_values('date')) + 1
        ).values.reshape(-1, 1)
        return self

    def transform(self, X):
        # Two columns (cos and sin) for each of the n strongest frequencies.
        Xt = np.zeros((X.shape[0], 2 * len(self.f_ind[-self.n:])))
        for i, f in enumerate(self.f_ind[-self.n:]):
            Xt[:, 2 * i] = np.cos(2 * np.pi * f * self.ind).reshape(-1)
            Xt[:, 2 * i + 1] = np.sin(2 * np.pi * f * self.ind).reshape(-1)
        return Xt


unixdata = DataPreprocess()
fourier = FourierComponents()

model = Pipeline([
    ('indices', unixdata),
    ('fourier', fourier),
    ('scaler', StandardScaler()),
    ('regressor', Ridge())
])

param_grid = {'fourier__n': list(range(3, 5)),
              'regressor__alpha': np.logspace(1, 4, 20)}

grid_search = GridSearchCV(model, param_grid, cv=5, verbose=1,
                           scoring='neg_mean_absolute_error')
grid_search.fit(sj_train_features, sj_train_labels)
Fitting the grid search here gives me this error:
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-167-cfce20172a59> in <module>
----> 1 grid_search.fit(sj_train_features, sj_train_labels)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
558 else:
559 fit_time = time.time() - start_time
--> 560 test_scores = _score(estimator, X_test, y_test, scorer)
561 score_time = time.time() - start_time - fit_time
562 if return_train_score:
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
605 scores = scorer(estimator, X_test)
606 else:
--> 607 scores = scorer(estimator, X_test, y_test)
608
609 error_msg = ("scoring must return a number, got %s (%s) "
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
85 for name, scorer in self._scorers.items():
86 if isinstance(scorer, _BaseScorer):
---> 87 score = scorer._score(cached_call, estimator,
88 *args, **kwargs)
89 else:
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _score(self, method_caller, estimator, X, y_true, sample_weight)
210 **self._kwargs)
211 else:
--> 212 return self._sign * self._score_func(y_true, y_pred,
213 **self._kwargs)
214
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
176 0.85...
177 """
--> 178 y_type, y_true, y_pred, multioutput = _check_reg_targets(
179 y_true, y_pred, multioutput)
180 check_consistent_length(y_true, y_pred, sample_weight)
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
82
83 """
---> 84 check_consistent_length(y_true, y_pred)
85 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
86 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
254 uniques = np.unique(lengths)
255 if len(uniques) > 1:
--> 256 raise ValueError("Found input variables with inconsistent numbers of"
257 " samples: %r" % [int(l) for l in lengths])
258
ValueError: Found input variables with inconsistent numbers of samples: [188, 748]
but
model.fit(sj_train_features, sj_train_labels)
fits perfectly. Now I am wondering why this happens and where the mistake in the code is. Can anyone point me in the right direction?
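One thing I noticed while debugging, which may or may not be related: the output of the first pipeline step seems to have the same number of rows no matter what I pass into transform. A minimal check of just the preprocessing step on its own (assuming the full training frames described below):

pre = DataPreprocess().fit(sj_train_features, sj_train_labels)

full = pre.transform(sj_train_features)
part = pre.transform(sj_train_features.iloc[:100])

# Both print the row count of the frame merged during fit; the second
# call does not print 100, even though only 100 rows were passed in.
print(full.shape[0])
print(part.shape[0])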
A small example of my data looks like this:
                week_start_date  ndvi_ne  station_precip  ...
year weekofyear
1990 18              1990-04-30   0.1226           12.42
Similarly, sj_train_labels has the same two indices, year and weekofyear, with a single column total_cases.
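In case it helps to reproduce, a tiny synthetic stand-in with the same index structure could be built like this (the second row's values and the total_cases numbers are made up purely to illustrate the shape):

import pandas as pd

idx = pd.MultiIndex.from_tuples([(1990, 18), (1990, 19)],
                                names=['year', 'weekofyear'])
sj_train_features = pd.DataFrame(
    {'week_start_date': ['1990-04-30', '1990-05-07'],
     'ndvi_ne': [0.1226, 0.1699],
     'station_precip': [12.42, 22.82]},
    index=idx)
sj_train_labels = pd.DataFrame({'total_cases': [4, 5]}, index=idx)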