'list' object has no attribute 'lower' with TfidfVectorizer
I have a dataframe with two text columns, which I converted to lists. I also split the data into train and test sets. But when I fit a baseline model, TfidfVectorizer throws the error 'list' object has no attribute 'lower'.
Here is the code:
X['ItemDescription']= X['ItemDescription'].str.lower()
X['DiagnosisOne'] = X['DiagnosisOne'].str.lower()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Convert abstract text lines into lists
train_items = X_train.reset_index().values.tolist()
test_items = X_test.reset_index().values.tolist()
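For reference, train_items ends up as a list of row lists (the reset index plus the two text columns), not a list of strings. A small stand-alone illustration with made-up rows (assuming X only has these two columns):
import pandas as pd
demo = pd.DataFrame({
    "ItemDescription": ["chest pain", "fractured wrist"],
    "DiagnosisOne": ["angina", "fracture"],
})
print(demo.reset_index().values.tolist())
# [[0, 'chest pain', 'angina'], [1, 'fractured wrist', 'fracture']]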
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(y_train.to_numpy())
test_labels_encoded = label_encoder.transform(y_test.to_numpy())
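As far as I understand, LabelEncoder just maps each class label to an integer; a tiny stand-alone check with made-up labels:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
print(enc.fit_transform(["flu", "cold", "flu"]))  # [1 0 1]
print(enc.classes_)                               # ['cold' 'flu']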
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Create a pipeline
model_0 = Pipeline([
    ("tf-idf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])
# Fit the pipeline to the training data
model_0.fit(X=train_items,
            y=train_labels_encoded);
Error
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/tmp/ipykernel_789/2404586028.py in <module>
     11 # Fit the pipeline to the training data
     12 model_0.fit(X=train_items,
---> 13             y=train_labels_encoded);

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    388 
    389         fit_params_steps = self._check_fit_params(**fit_params)
--> 390         Xt = self._fit(X, y, **fit_params_steps)
    391         with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
    392             if self._final_estimator != "passthrough":

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    353                 message_clsname="Pipeline",
    354                 message=self._log_message(step_idx),
--> 355                 **fit_params_steps[name],
    356             )
    357             # Replace the transformer of the step with the fitted

/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    347 
    348     def __call__(self, *args, **kwargs):
--> 349         return self.func(*args, **kwargs)
    350 
    351     def call_and_shelve(self, *args, **kwargs):

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    891     with _print_elapsed_time(message_clsname, message):
    892         if hasattr(transformer, "fit_transform"):
--> 893             res = transformer.fit_transform(X, y, **fit_params)
    894         else:
    895             res = transformer.fit(X, y, **fit_params).transform(X)

/opt/conda/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
   2075 
   2076         self._check_params()
-> 2077         X = super().fit_transform(raw_documents)
   2078         self._tfidf.fit(X)
   2079         # X is already a transformed view of raw_documents so

/opt/conda/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
   1328                 break
   1329 
-> 1330         vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
   1331 
   1332         if self.binary:

/opt/conda/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
   1199         for doc in raw_documents:
   1200             feature_counter = {}
-> 1201             for feature in analyze(doc):
   1202                 try:
   1203                     feature_idx = vocabulary[feature]

/opt/conda/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
    111     else:
    112         if preprocessor is not None:
--> 113             doc = preprocessor(doc)
    114         if tokenizer is not None:
    115             doc = tokenizer(doc)

/opt/conda/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _preprocess(doc, accent_function, lower)
     69 
     70     if lower:
---> 71         doc = doc.lower()
     72     if accent_function is not None:
     73         doc = accent_function(doc)
AttributeError: 'list' object has no attribute 'lower'
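For comparison, as far as I understand TfidfVectorizer expects an iterable of raw strings (one document per element), and the same pipeline does fit when I pass a flat list of made-up strings:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
docs = ["chest pain and angina", "fractured wrist"]  # one string per document
labels = [0, 1]
demo_model = Pipeline([
    ("tf-idf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])
demo_model.fit(docs, labels)  # no AttributeError here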
Topic: tfidf, multiclass-classification, nlp
Category: Data Science