提问人:user22 提问时间:4/14/2023 更新时间:4/14/2023 访问量:64
无法将自定义函数添加到 Python 的记录链接库
Cannot add custom function to Python's recordlinkage library
问:
尝试将自定义函数添加到 Python 的记录链接库中,但得到 .在自定义函数中,我只计算两个字符串token_set_ratio。KeyError: 0
import recordlinkage
indexer = recordlinkage.Index()
indexer.sortedneighbourhood(left_on='desc', right_on='desc')
full_candidate_links = indexer.index(df_a, df_b)
from recordlinkage.base import BaseCompareFeature
class GetTokenSetRatio(BaseCompareFeature):
def _compute_vectorized(self, s1, s2):
return fuzz.token_set_ratio(s1, s2)
c = recordlinkage.Compare()
c.string('desc', 'desc', method='jarowinkler', label = 'jarowinkler')
c.string('desc', 'desc', method='jaro', label = 'jaro')
c.string('desc', 'desc', method='levenshtein', label = 'levenshtein')
c.string('desc', 'desc', method='damerau_levenshtein', label = 'damerau_levenshtein')
c.string('desc', 'desc', method='cosine', label = 'cosine')
c.string('desc', 'desc', method='qgram', label = 'qgram')
c.add(GetTokenSetRatio('desc', 'desc', label='token_set'))
# The comparison vectors
feature_vectors = c.compute(full_candidate_links, df_a, df_b)
全栈跟踪
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<command-2705282071834506> in <module>
10
11 # The comparison vectors
---> 12 feature_vectors = c.compute(full_candidate_links, df_a, df_b)
/databricks/python/lib/python3.8/site-packages/recordlinkage/base.py in compute(self, pairs, x, x_link)
860
861 if self.n_jobs == 1:
--> 862 results = self._compute(pairs, x, x_link)
863 elif self.n_jobs > 1:
864 results = self._compute_parallel(pairs,
/databricks/python/lib/python3.8/site-packages/recordlinkage/base.py in _compute(self, pairs, x, x_link)
725 [df_b_indexed[lbl] for lbl in listify(feat.labels_right)])
726
--> 727 result = feat._compute(data1, data2)
728 features.append((result, feat.label))
729
/databricks/python/lib/python3.8/site-packages/recordlinkage/base.py in _compute(self, left_on, right_on)
450 numpy.ndarray objects.
451 """
--> 452 result = self._compute_vectorized(*tuple(left_on + right_on))
453
454 return result
<command-2481998006509980> in _compute_vectorized(self, s1, s2)
3 class GetTokenSetRatio(BaseCompareFeature):
4 def _compute_vectorized(self, s1, s2):
----> 5 return fuzz.token_set_ratio(s1, s2)
src/rapidfuzz/fuzz_cpp_impl.pyx in rapidfuzz.fuzz_cpp_impl_avx2.token_set_ratio()
./src/rapidfuzz/cpp_common.pxd in cpp_common.preprocess_strings()
src/rapidfuzz/utils_cpp.pyx in rapidfuzz.utils_cpp.default_process_capi()
./src/rapidfuzz/cpp_common.pxd in cpp_common.conv_sequence()
./src/rapidfuzz/cpp_common.pxd in cpp_common.hash_sequence()
./src/rapidfuzz/cpp_common.pxd in cpp_common.hash_sequence()
/databricks/python/lib/python3.8/site-packages/pandas/core/series.py in __getitem__(self, key)
851
852 elif key_is_scalar:
--> 853 return self._get_value(key)
854
855 if is_hashable(key):
/databricks/python/lib/python3.8/site-packages/pandas/core/series.py in _get_value(self, label, takeable)
959
960 # Similar to Index.get_value, but we do not fall back to positional
--> 961 loc = self.index.get_loc(label)
962 return self.index._get_values_for_loc(self, loc, label)
963
/databricks/python/lib/python3.8/site-packages/pandas/core/indexes/multi.py in get_loc(self, key, method)
2874
2875 if not isinstance(key, tuple):
-> 2876 loc = self._get_level_indexer(key, level=0)
2877 return _maybe_to_slice(loc)
2878
/databricks/python/lib/python3.8/site-packages/pandas/core/indexes/multi.py in _get_level_indexer(self, key, level, indexer)
3163 if not locs.any():
3164 # The label is present in self.levels[level] but unused:
-> 3165 raise KeyError(key)
3166 return locs
3167
KeyError: 0
答: 暂无答案
评论