Unable to pass X_train and y_train to my regressor's fit method: I get a ValueError

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv('housing.csv')
 
data.drop('ocean_proximity', axis=1, inplace = True)

data.head()
longitude   latitude    housing_median_age  total_rooms total_bedrooms  population  households  median_income   median_house_value
0   -122.23 37.88   41.0    880.0   129.0   322.0   126.0   8.3252  452600.0
1   -122.22 37.86   21.0    7099.0  1106.0  2401.0  1138.0  8.3014  358500.0
2   -122.24 37.85   52.0    1467.0  190.0   496.0   177.0   7.2574  352100.0
3   -122.25 37.85   52.0    1274.0  235.0   558.0   219.0   5.6431  341300.0
4   -122.25 37.85   52.0    1627.0  280.0   565.0   259.0   3.8462  342200.0
X = data.iloc[:, 6:-1].values
y= data.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, y_train, X_test, y_test = train_test_split (X,y, test_size = 0.2, random_state = 0)

print(X_train)
[[ 65.       4.2386]
 [447.       4.3898]
 [368.       3.9333]
 ...
 [393.       3.1977]
 [468.       5.6315]
 [298.       1.3882]]
print(y_train)
[[371.       4.1518]
 [429.       5.7796]
 [534.       4.3487]
 ...
 [326.       3.2027]
 [374.       6.1436]
 [406.       3.3326]]
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(np.array(X_train).reshape(-1, 1), y_train)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-345-0edbf6e4cc5c> in <module>
----> 1 regressor.fit(np.array(X_train).reshape(-1, 1), y_train)

~\anaconda3\lib\site-packages\sklearn\linear_model\_base.py in fit(self, X, y, sample_weight)
    516         accept_sparse = False if self.positive else ['csr', 'csc', 'coo']
    517 
--> 518         X, y = self._validate_data(X, y, accept_sparse=accept_sparse,
    519                                    y_numeric=True, multi_output=True)
    520 

~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    431                 y = check_array(y, **check_y_params)
    432             else:
--> 433                 X, y = check_X_y(X, y, **check_params)
    434             out = X, y
    435 

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
    829         y = y.astype(np.float64)
    830 
--> 831     check_consistent_length(X, y)
    832 
    833     return X, y

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
    260     uniques = np.unique(lengths)
    261     if len(uniques) > 1:
--> 262         raise ValueError("Found input variables with inconsistent numbers of"
    263                          " samples: %r" % [int(l) for l in lengths])
    264 

ValueError: Found input variables with inconsistent numbers of samples: [33024, 4128]
```

Topic numpy linear-regression pandas python

Category Data Science


You are unpacking the tuple returned by train_test_split in the wrong order.

It returns the two X matrices first (train, then test), followed by the two y matrices (train, then test).

Like so:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

There's something weird about the dimensions of X_train and y_train. When the result of train_test_split is unpacked correctly, they automatically have the same number of rows, but for some reason you call reshape(-1, 1) on X_train. Because X_train has two feature columns, this flattens them into a single column and doubles the number of rows (16512 becomes 33024), so of course X_train no longer has the same number of rows as y_train, hence the error.

Normally you shouldn't have to reshape the features; it's normal to have several features per instance.

About

Geeks Mental is a community that publishes articles and tutorials about Web, Android, Data Science, new techniques and Linux security.