2022-04-14

Model works perfectly but GridSearch causes error

While working on a project I have come across a weird error, where fitting my model works perfectly but when I apply gridsearch it gives me an error.

The code puts all the necessary objects created and uses them in the pipeline.

from sklearn.base import BaseEstimator, TransformerMixin
class DataPreprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        self.X_m = X.merge(y, on= ['year', 'weekofyear'])
        
        return self
    
    def transform(self, X):
        dt = pd.to_datetime(self.X_m["week_start_date"], format="%Y-%m-%d")
        unix = []
        for i in dt:
            unix.append(time.mktime(i.timetuple()))
        X_t = (self.X_m).reset_index().assign(date = unix).set_index(['date'])
        
        return X_t 



class FourierComponents(BaseEstimator, TransformerMixin):
    """creat features based on sin(2*pi*f*t) and cos(2*pi*f*t)"""
    def __init__(self, n=10):
        self.n = n
        
    def fit(self, X, y= None):
        self.labels = X['total_cases']
        
        self.Y_t = fft(self.labels - (self.labels).mean())
        self.Y_t = self.Y_t[:len(self.labels)//2]

        
        self.ind_max = np.abs(self.Y_t).argsort()
        
        self.t_span = len(self.labels)

        self.f = np.linspace(0, len(self.Y_t), len(self.Y_t)) / self.t_span
        
        self.f_ind = self.f[self.ind_max]
        
        self.ind = pd.RangeIndex(start = 1, stop=(len(X.index.get_level_values('date')) +1)).values.reshape(-1, 1)
    
        return self
    
    def transform(self, X):
        
        Xt = np.zeros((X.shape[0], 2*len(self.f_ind[-self.n:])))
        
        for i, f in enumerate(self.f_ind[-self.n:]):
            Xt[:, 2*i]     = np.cos(2*np.pi*f*self.ind).reshape(-1)
            Xt[:, 2*i + 1] = np.sin(2*np.pi*f*self.ind).reshape(-1)
        
        return Xt

Unixdata = DataPreprocess()
fourier = FourierComponents()

model = Pipeline([
    ('indices', Unixdata),
    ('fourier', fourier),
    ('scalar', StandardScaler()),
    ('regressor', Ridge())
])

param_grid = {'fourier__n' : list(range(3,5)),
              'regressor__alpha' : np.logspace(1, 4, 20)}

grid_search = GridSearchCV(model, param_grid, cv = 5, verbose = 1, scoring='neg_mean_absolute_error')

grid_search.fit(sj_train_features, sj_train_labels)

fitting the grid_search here gives me this error:

    Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-167-cfce20172a59> in <module>
----> 1 grid_search.fit(sj_train_features, sj_train_labels)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     71                           FutureWarning)
     72         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73         return f(**kwargs)
     74     return inner_f
     75 

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    734                 return results
    735 
--> 736             self._run_search(evaluate_candidates)
    737 
    738         # For multi-metric evaluation, store the best_index_, best_params_ and

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
   1186     def _run_search(self, evaluate_candidates):
   1187         """Search all candidates in param_grid"""
-> 1188         evaluate_candidates(ParameterGrid(self.param_grid))
   1189 
   1190 

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
    706                               n_splits, n_candidates, n_candidates * n_splits))
    707 
--> 708                 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
    709                                                        X, y,
    710                                                        train=train, test=test,

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1027             # remaining jobs.
   1028             self._iterating = False
-> 1029             if self.dispatch_one_batch(iterator):
   1030                 self._iterating = self._original_iterator is not None
   1031 

~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    845                 return False
    846             else:
--> 847                 self._dispatch(tasks)
    848                 return True
    849 

~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    763         with self._lock:
    764             job_idx = len(self._jobs)
--> 765             job = self._backend.apply_async(batch, callback=cb)
    766             # A job can complete so quickly than its callback is
    767             # called before we get here, causing self._jobs to

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
    250         # change the default number of processes to -1
    251         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252             return [func(*args, **kwargs)
    253                     for func, args, kwargs in self.items]
    254 

~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    250         # change the default number of processes to -1
    251         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252             return [func(*args, **kwargs)
    253                     for func, args, kwargs in self.items]
    254 

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    558     else:
    559         fit_time = time.time() - start_time
--> 560         test_scores = _score(estimator, X_test, y_test, scorer)
    561         score_time = time.time() - start_time - fit_time
    562         if return_train_score:

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
    605         scores = scorer(estimator, X_test)
    606     else:
--> 607         scores = scorer(estimator, X_test, y_test)
    608 
    609     error_msg = ("scoring must return a number, got %s (%s) "

~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
     85         for name, scorer in self._scorers.items():
     86             if isinstance(scorer, _BaseScorer):
---> 87                 score = scorer._score(cached_call, estimator,
     88                                       *args, **kwargs)
     89             else:

~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _score(self, method_caller, estimator, X, y_true, sample_weight)
    210                                                  **self._kwargs)
    211         else:
--> 212             return self._sign * self._score_func(y_true, y_pred,
    213                                                  **self._kwargs)
    214 

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     71                           FutureWarning)
     72         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73         return f(**kwargs)
     74     return inner_f
     75 

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
    176     0.85...
    177     """
--> 178     y_type, y_true, y_pred, multioutput = _check_reg_targets(
    179         y_true, y_pred, multioutput)
    180     check_consistent_length(y_true, y_pred, sample_weight)

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
     82 
     83     """
---> 84     check_consistent_length(y_true, y_pred)
     85     y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
     86     y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
    254     uniques = np.unique(lengths)
    255     if len(uniques) > 1:
--> 256         raise ValueError("Found input variables with inconsistent numbers of"
    257                          " samples: %r" % [int(l) for l in lengths])
    258 

ValueError: Found input variables with inconsistent numbers of samples: [188, 748]

but

model.fit(sj_train_features, sj_train_labels)

fits perfectly.

Now I am wondering why and where is the mistake in the code? Can anyone point me in the right direction?

A small example of my data looks like this:

                   week_start_date  ndvi_ne station_precip ...
year   weekofyear 
1990     18         1990-04-30       0.1226    12.42

similarly the sj_train_labels have two indices of year and weekofyear with one column of total_cases



No comments:

Post a Comment