'list' object has no attribute 'lower' TfidfVectorizer

I have a dataframe with two text columns and I converted them to a list. I separated the train and test data as well. But while building a baseline model, TfidfVectorizer throws the error 'list' object has no attribute 'lower'.

Here is the code:

    X['ItemDescription']= X['ItemDescription'].str.lower()
    X['DiagnosisOne'] = X['DiagnosisOne'].str.lower()
    
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert abstract text lines into lists
    train_items = X_train.reset_index().values.tolist()
    test_items = X_test.reset_index().values.tolist()

    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder()
    train_labels_encoded = label_encoder.fit_transform(y_train.to_numpy())
    test_labels_encoded = label_encoder.transform(y_test.to_numpy())

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline

    # Create a pipeline
    model_0 = Pipeline([
        ("tf-idf", TfidfVectorizer()),
        ("clf", MultinomialNB())
    ])

    # Fit the pipeline to the training data
    model_0.fit(X=train_items,
                y=train_labels_encoded);

Error

    ---------------------------------------------------------------------------
    AttributeError                            Traceback (most recent call last)
    /tmp/ipykernel_789/2404586028.py in <module>
         11 # Fit the pipeline to the training data
         12 model_0.fit(X=train_items,
    ---> 13             y=train_labels_encoded);

    /opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
        388
        389         fit_params_steps = self._check_fit_params(**fit_params)
    --> 390         Xt = self._fit(X, y, **fit_params_steps)
        391         with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
        392             if self._final_estimator != "passthrough":

    /opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
        353                 message_clsname="Pipeline",
        354                 message=self._log_message(step_idx),
    --> 355                 **fit_params_steps[name],
        356             )
        357             # Replace the transformer of the step with the fitted

    /opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
        347
        348     def __call__(self, *args, **kwargs):
    --> 349         return self.func(*args, **kwargs)
        350
        351     def call_and_shelve(self, *args, **kwargs):

    /opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
        891     with _print_elapsed_time(message_clsname, message):
        892         if hasattr(transformer, "fit_transform"):
    --> 893             res = transformer.fit_transform(X, y, **fit_params)
        894         else:
        895             res = transformer.fit(X, y, **fit_params).transform(X)

    /opt/conda/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
       2075
       2076         self._check_params()
    -> 2077         X = super().fit_transform(raw_documents)
       2078         self._tfidf.fit(X)
       2079         # X is already a transformed view of raw_documents so

    /opt/conda/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
       1328                     break
       1329
    -> 1330         vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
       1331
       1332         if self.binary:

    /opt/conda/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
       1199         for doc in raw_documents:
       1200             feature_counter = {}
    -> 1201             for feature in analyze(doc):
       1202                 try:
       1203                     feature_idx = vocabulary[feature]

    /opt/conda/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
        111     else:
        112         if preprocessor is not None:
    --> 113             doc = preprocessor(doc)
        114         if tokenizer is not None:
        115             doc = tokenizer(doc)

    /opt/conda/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _preprocess(doc, accent_function, lower)
         69
         70     if lower:
    ---> 71         doc = doc.lower()
         72     if accent_function is not None:
         73         doc = accent_function(doc)

    AttributeError: 'list' object has no attribute 'lower'

Topic: tfidf, multiclass-classification, nlp

Category: Data Science


The lower function works on a string, not on a list, and TfidfVectorizer expects each document to be a single string: its default preprocessor calls doc.lower() on every document it receives, which is exactly what fails here. In your code train_items is a list of lists, because reset_index().values.tolist() turns every dataframe row into a list of its values, so the preprocessor gets a list instead of a string.

So make sure each element you pass to the vectorizer is a string, for example

    [x.lower() for x in text]

where every x is a string, and use that as the argument for the function.
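
To illustrate the difference, here is a minimal sketch; the documents and variable names below are made up for demonstration, not taken from your data:

    from sklearn.feature_extraction.text import TfidfVectorizer

    # One element per document, but each document is a list of tokens:
    # feeding this to the vectorizer raises
    # AttributeError: 'list' object has no attribute 'lower'
    docs_as_lists = [["some", "text"], ["more", "words"]]

    # Join each row's fields into one lowercased string per document instead
    docs_as_strings = [" ".join(doc).lower() for doc in docs_as_lists]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(docs_as_strings)  # works: each document is a str
    print(tfidf_matrix.shape)

Applied to your code, that would mean building one string per row, for example by joining the ItemDescription and DiagnosisOne columns, before handing train_items to the pipeline, rather than passing the list of row values produced by values.tolist().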
