无法将自定义函数添加到 Python 的记录链接库

Cannot add custom function to Python's recordlinkage library

提问人:user22 提问时间:4/14/2023 更新时间:4/14/2023 访问量:64

问:

尝试将自定义函数添加到 Python 的记录链接库中,但得到 .在自定义函数中,我只计算两个字符串token_set_ratio。KeyError: 0

import recordlinkage

indexer = recordlinkage.Index()
indexer.sortedneighbourhood(left_on='desc', right_on='desc')
full_candidate_links = indexer.index(df_a, df_b)

from recordlinkage.base import BaseCompareFeature

class GetTokenSetRatio(BaseCompareFeature):
    def _compute_vectorized(self, s1, s2):
        return fuzz.token_set_ratio(s1, s2)

c = recordlinkage.Compare()

c.string('desc', 'desc', method='jarowinkler', label = 'jarowinkler')
c.string('desc', 'desc', method='jaro', label = 'jaro')
c.string('desc', 'desc', method='levenshtein', label = 'levenshtein')
c.string('desc', 'desc', method='damerau_levenshtein', label = 'damerau_levenshtein')
c.string('desc', 'desc', method='cosine', label = 'cosine')
c.string('desc', 'desc', method='qgram', label = 'qgram')
c.add(GetTokenSetRatio('desc', 'desc', label='token_set'))

# The comparison vectors
feature_vectors = c.compute(full_candidate_links, df_a, df_b)

全栈跟踪

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<command-2705282071834506> in <module>
     10 
     11 # The comparison vectors
---> 12 feature_vectors = c.compute(full_candidate_links, df_a, df_b)

/databricks/python/lib/python3.8/site-packages/recordlinkage/base.py in compute(self, pairs, x, x_link)
    860 
    861         if self.n_jobs == 1:
--> 862             results = self._compute(pairs, x, x_link)
    863         elif self.n_jobs > 1:
    864             results = self._compute_parallel(pairs,

/databricks/python/lib/python3.8/site-packages/recordlinkage/base.py in _compute(self, pairs, x, x_link)
    725                     [df_b_indexed[lbl] for lbl in listify(feat.labels_right)])
    726 
--> 727             result = feat._compute(data1, data2)
    728             features.append((result, feat.label))
    729 

/databricks/python/lib/python3.8/site-packages/recordlinkage/base.py in _compute(self, left_on, right_on)
    450             numpy.ndarray objects.
    451         """
--> 452         result = self._compute_vectorized(*tuple(left_on + right_on))
    453 
    454         return result

<command-2481998006509980> in _compute_vectorized(self, s1, s2)
      3 class GetTokenSetRatio(BaseCompareFeature):
      4     def _compute_vectorized(self, s1, s2):
----> 5         return fuzz.token_set_ratio(s1, s2)

src/rapidfuzz/fuzz_cpp_impl.pyx in rapidfuzz.fuzz_cpp_impl_avx2.token_set_ratio()

./src/rapidfuzz/cpp_common.pxd in cpp_common.preprocess_strings()

src/rapidfuzz/utils_cpp.pyx in rapidfuzz.utils_cpp.default_process_capi()

./src/rapidfuzz/cpp_common.pxd in cpp_common.conv_sequence()

./src/rapidfuzz/cpp_common.pxd in cpp_common.hash_sequence()

./src/rapidfuzz/cpp_common.pxd in cpp_common.hash_sequence()

/databricks/python/lib/python3.8/site-packages/pandas/core/series.py in __getitem__(self, key)
    851 
    852         elif key_is_scalar:
--> 853             return self._get_value(key)
    854 
    855         if is_hashable(key):

/databricks/python/lib/python3.8/site-packages/pandas/core/series.py in _get_value(self, label, takeable)
    959 
    960         # Similar to Index.get_value, but we do not fall back to positional
--> 961         loc = self.index.get_loc(label)
    962         return self.index._get_values_for_loc(self, loc, label)
    963 

/databricks/python/lib/python3.8/site-packages/pandas/core/indexes/multi.py in get_loc(self, key, method)
   2874 
   2875         if not isinstance(key, tuple):
-> 2876             loc = self._get_level_indexer(key, level=0)
   2877             return _maybe_to_slice(loc)
   2878 

/databricks/python/lib/python3.8/site-packages/pandas/core/indexes/multi.py in _get_level_indexer(self, key, level, indexer)
   3163                 if not locs.any():
   3164                     # The label is present in self.levels[level] but unused:
-> 3165                     raise KeyError(key)
   3166                 return locs
   3167 

KeyError: 0

python-3.x pandas 记录链接

评论


答: 暂无答案