recognizing the correct word & "Set type is unordered"-error in python-pandas

My Data Set (CSV):

CL1,CL2,CL3 
Hello Worrld,Hello ! World,Snack
Hello % World,Hello World,Vol 8.5% Alc
Hello World,Good! Hello,Hello World
Good Morning,Airplane,Good Morning
JK^KJ,Good Morning,Talueas

My Goal:

1- I would like to search and find the similar values between all columns (CL1-CL3) and sort in a new column (SIM).

2- I would like to find the non-similar values between columns and sort in another column (NON-SIM).

What I Would Like:

Actually, I would like to use it in supervised learning for clustering. This is only a sample of my data set. In reality, it is 2e9x3x9. I want my algorithm to correctly recognize the word Hello World when it sees Hello Worrld or Hello % World.

What I did:

import numpy as np 
import pandas as pd#
from functools import reduce
df=pd.read_csv(....)

# First, get each column's unique values as a set.
# NOTE: this is the call that raises "TypeError: Set type is unordered"
# (see the traceback in the problem description).
cols = df.agg(set)
print(cols)

# Combine the per-column sets using set operations to get the desired result:
# `sim` holds the values present in every column, `non_sim` holds all the rest.
sim = reduce(set.intersection, cols)
non_sim = reduce(set.union, cols) - sim

Problem with cols = df.agg(set)

    TypeError                                 Traceback (most recent call last)
<ipython-input-4-d51c2e7d293c> in <module>
----> 1 cols = df.agg(set)
      2 print(cols)

~\anaconda3\lib\site-packages\pandas\core\frame.py in aggregate(self, func, axis, *args, **kwargs)
   7581             raise exc from err
   7582         if result is None:
-> 7583             return self.apply(func, axis=axis, args=args, **kwargs)
   7584 
   7585         if relabeling:

~\anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
   7763             kwds=kwds,
   7764         )
-> 7765         return op.get_result()
   7766 
   7767     def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:

~\anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
    183             return self.apply_raw()
    184 
--> 185         return self.apply_standard()
    186 
    187     def apply_empty_result(self):

~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
    277 
    278         # wrap results
--> 279         return self.wrap_results(results, res_index)
    280 
    281     def apply_series_generator(self) -> Tuple[ResType, Index]:

~\anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results(self, results, res_index)
    301         # see if we can infer the results
    302         if len(results) > 0 and 0 in results and is_sequence(results[0]):
--> 303             return self.wrap_results_for_axis(results, res_index)
    304 
    305         # dict of scalars

~\anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results_for_axis(self, results, res_index)
    359 
    360         try:
--> 361             result = self.obj._constructor(data=results)
    362         except ValueError as err:
    363             if "arrays must all be same length" in str(err):

~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
    527 
    528         elif isinstance(data, dict):
--> 529             mgr = init_dict(data, index, columns, dtype=dtype)
    530         elif isinstance(data, ma.MaskedArray):
    531             import numpy.ma.mrecords as mrecords

~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_dict(data, index, columns, dtype)
    285             arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
    286         ]
--> 287     return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    288 
    289 

~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
     83 
     84         # don't force copy because getting jammed in an ndarray anyway
---> 85         arrays = _homogenize(arrays, index, dtype)
     86 
     87         columns = ensure_index(columns)

~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in _homogenize(data, index, dtype)
    353                     val = dict(val)
    354                 val = lib.fast_multiget(val, oindex._values, default=np.nan)
--> 355             val = sanitize_array(
    356                 val, index, dtype=dtype, copy=False, raise_cast_failure=False
    357             )

~\anaconda3\lib\site-packages\pandas\core\construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
    474         if isinstance(data, set):
    475             # Raise only for unordered sets, e.g., not for dict_keys
--> 476             raise TypeError("Set type is unordered")
    477         data = list(data)
    478 

TypeError: Set type is unordered

Topic text-classification error-handling pandas data-cleaning machine-learning

Category Data Science


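See also this Stack Overflow answer. If you just want the unique values, you can use pandas.Series.unique() or pandas.DataFrame.drop_duplicates(). If you need a Python set object, you can use set(df['colname']); for example, cols = [set(df[c]) for c in df.columns] builds one set per column without going through df.agg(set).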

About

Geeks Mental is a community that publishes articles and tutorials about Web, Android, Data Science, new techniques and Linux security.