Recognizing the correct word & "Set type is unordered" error in Python pandas
My Data Set (CSV):
CL1,CL2,CL3
Hello Worrld,Hello ! World,Snack
Hello % World,Hello World,Vol 8.5% Alc
Hello World,Good! Hello,Hello World
Good Morning,Airplane,Good Morning
JK^KJ,Good Morning,Talueas
My Goal:
1- I would like to find the values that are similar across all columns (CL1-CL3) and sort them into a new column (SIM).
2- I would like to find the values that are not similar across the columns and sort them into another column (NON-SIM).
What I Would Like:
Actually, I would like to use it in supervised learning for clustering. This is only a sample of my data set; in reality, its size is 2e9x3x9. I want my algorithm to correctly recognize the phrase Hello World when it sees Hello Worrld or Hello % World.
What I did:
import numpy as np
import pandas as pd#
from functools import reduce
# Load the CSV data set shown above (the path is elided in this post).
df=pd.read_csv(....)
#first get each column's unique values as a set:
# NOTE: this is the call that raises "TypeError: Set type is unordered"
# in the traceback below.
cols = df.agg(set)
print(cols)
#combine these using set operations to get your desired res
# Intended: the values common to every column (the SIM result).
sim = reduce(set.intersection, cols)
# Intended: the values found in some column but not in all of them (the NON-SIM result).
non_sim = reduce(set.union, cols) - sim
Problem with cols = df.agg(set)
TypeError Traceback (most recent call last)
<ipython-input-4-d51c2e7d293c> in <module>
----> 1 cols = df.agg(set)
      2 print(cols)
~\anaconda3\lib\site-packages\pandas\core\frame.py in aggregate(self, func, axis, *args, **kwargs)
7581 raise exc from err
7582 if result is None:
- 7583 return self.apply(func, axis=axis, args=args, **kwargs)
7584
7585 if relabeling:
~\anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7763 kwds=kwds,
7764 )
- 7765 return op.get_result()
7766
7767 def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
~\anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
183 return self.apply_raw()
184
-- 185 return self.apply_standard()
186
187 def apply_empty_result(self):
~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
277
278 # wrap results
-- 279 return self.wrap_results(results, res_index)
280
281 def apply_series_generator(self) -> Tuple[ResType, Index]:
~\anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results(self, results, res_index)
301 # see if we can infer the results
302 if len(results) > 0 and 0 in results and is_sequence(results[0]):
-- 303 return self.wrap_results_for_axis(results, res_index)
304
305 # dict of scalars
~\anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results_for_axis(self, results, res_index)
359
360 try:
-- 361 result = self.obj._constructor(data=results)
362 except ValueError as err:
363 if "arrays must all be same length" in str(err):
~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
527
528 elif isinstance(data, dict):
-- 529 mgr = init_dict(data, index, columns, dtype=dtype)
530 elif isinstance(data, ma.MaskedArray):
531 import numpy.ma.mrecords as mrecords
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_dict(data, index, columns, dtype)
285 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
286 ]
-- 287 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
288
289
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
83
84 # don't force copy because getting jammed in an ndarray anyway
--- 85 arrays = _homogenize(arrays, index, dtype)
86
87 columns = ensure_index(columns)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in _homogenize(data, index, dtype)
353 val = dict(val)
354 val = lib.fast_multiget(val, oindex._values, default=np.nan)
-- 355 val = sanitize_array(
356 val, index, dtype=dtype, copy=False, raise_cast_failure=False
357 )
~\anaconda3\lib\site-packages\pandas\core\construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
474 if isinstance(data, set):
475 # Raise only for unordered sets, e.g., not for dict_keys
--> 476 raise TypeError("Set type is unordered")
477 data = list(data)
478
TypeError: Set type is unordered
Topic text-classification error-handling pandas data-cleaning machine-learning
Category Data Science