DatasetGenerationError: An error occurred while generating the dataset when trying to load the Common Voice locally

Asked by FOXASDF · Asked 11/9/2023 · Updated 11/9/2023 · Viewed 125 times

Q:

After downloading the entire Common Voice corpus, I tried to load the dataset, but it fails to load; I even reinstalled the datasets library from pip. As soon as the data-generation step starts, it raises an error.

from datasets import load_dataset


test = load_dataset("D:\Senior\cv-corpus-15.0-2023-09-08",'en',split="test")
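(A side note on the path string: in a plain Python string literal, sequences like \S are not valid escapes, so this particular path happens to survive, but it triggers deprecation warnings on Python 3.10 and can silently break for other folder names. A raw string avoids the ambiguity; this is just a defensive variant of the same call, not a fix for the error below:)

from datasets import load_dataset

# Raw string so the backslashes are taken literally; same corpus, config, and split as above.
test = load_dataset(r"D:\Senior\cv-corpus-15.0-2023-09-08", "en", split="test")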

Error:

ArrowInvalid                              Traceback (most recent call last)

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:1940, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1933     writer = writer_class(
   1934         features=writer._features,
   1935         path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
   (...)
   1938         embed_local_files=embed_local_files,
   1939     )
-> 1940 writer.write_table(table)
   1941 num_examples_progress_update += len(table)

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\arrow_writer.py:572, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
    571 pa_table = pa_table.combine_chunks()
--> 572 pa_table = table_cast(pa_table, self._schema)
    573 if self.embed_local_files:

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:2328, in table_cast(table, schema)
   2327 if table.schema != schema:
-> 2328     return cast_table_to_schema(table, schema)
   2329 elif table.schema.metadata != schema.metadata:

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:2287, in cast_table_to_schema(table, schema)
   2286     raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match")
-> 2287 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
   2288 return pa.Table.from_arrays(arrays, schema=schema)

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:2287, in <listcomp>(.0)
   2286     raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match")
-> 2287 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
   2288 return pa.Table.from_arrays(arrays, schema=schema)

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:1831, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
   1830 if isinstance(array, pa.ChunkedArray):
-> 1831     return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
   1832 else:

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:1831, in <listcomp>(.0)
   1830 if isinstance(array, pa.ChunkedArray):
-> 1831     return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
   1832 else:

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:2143, in cast_array_to_feature(array, feature, allow_number_to_str)
   2142 elif not isinstance(feature, (Sequence, dict, list, tuple)):
-> 2143     return array_cast(array, feature(), allow_number_to_str=allow_number_to_str)
   2144 raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{feature}")

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:1833, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
   1832 else:
-> 1833     return func(array, *args, **kwargs)

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:2027, in array_cast(array, pa_type, allow_number_to_str)
   2026         raise TypeError(f"Couldn't cast array of type {array.type} to {pa_type}")
-> 2027     return array.cast(pa_type)
   2028 raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{pa_type}")

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\array.pxi:935, in pyarrow.lib.Array.cast()

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\compute.py:400, in cast(arr, target_type, safe, options, memory_pool)
    399         options = CastOptions.safe(target_type)
--> 400 return call_function("cast", [arr], options, memory_pool)

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\_compute.pyx:572, in pyarrow._compute.call_function()

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\_compute.pyx:367, in pyarrow._compute.Function.call()

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\error.pxi:100, in pyarrow.lib.check_status()

ArrowInvalid: Failed to parse string: 'Benchmark' as a scalar of type double

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
c:\Users\foxas\OneDrive\Desktop\Senior\Models testing\Whisper Tiny.ipynb Cell 8 line 4
      1 from datasets import load_dataset
----> 4 test = load_dataset("D:\Senior\cv-corpus-15.0-2023-09-08",'en',split="test")

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py:2153, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   2150 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   2152 # Download and prepare data
-> 2153 builder_instance.download_and_prepare(
   2154     download_config=download_config,
   2155     download_mode=download_mode,
   2156     verification_mode=verification_mode,
   2157     try_from_hf_gcs=try_from_hf_gcs,
   2158     num_proc=num_proc,
   2159     storage_options=storage_options,
   2160 )
   2162 # Build dataset for splits
   2163 keep_in_memory = (
   2164     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   2165 )

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:954, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    952     if num_proc is not None:
    953         prepare_split_kwargs["num_proc"] = num_proc
--> 954     self._download_and_prepare(
    955         dl_manager=dl_manager,
    956         verification_mode=verification_mode,
    957         **prepare_split_kwargs,
    958         **download_and_prepare_kwargs,
    959     )
    960 # Sync info
    961 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:1049, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
   1045 split_dict.add(split_generator.split_info)
   1047 try:
   1048     # Prepare split will record examples associated to the split
-> 1049     self._prepare_split(split_generator, **prepare_split_kwargs)
   1050 except OSError as e:
   1051     raise OSError(
   1052         "Cannot find data file. "
   1053         + (self.manual_download_instructions or "")
   1054         + "\nOriginal error:\n"
   1055         + str(e)
   1056     ) from None

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:1813, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
   1811 job_id = 0
   1812 with pbar:
-> 1813     for job_id, done, content in self._prepare_split_single(
   1814         gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1815     ):
   1816         if done:
   1817             result = content

File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:1958, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1956     if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
   1957         e = e.__context__
-> 1958     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1960 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset
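For what it's worth, the underlying cast failure is reproducible in isolation, and the same check can be pointed at the split TSVs to find which column actually contains the literal string 'Benchmark'. This is only a diagnostic sketch: it assumes the standard Common Voice folder layout (a language folder with test.tsv under the corpus root) and that pandas is available; I have not confirmed which column is affected.

import pandas as pd
import pyarrow as pa

# 1) The cast that fails inside the builder, reduced to a toy example:
#    casting the string 'Benchmark' to double raises the same ArrowInvalid message.
try:
    pa.array(["Benchmark"]).cast(pa.float64())
except pa.ArrowInvalid as e:
    print(e)  # Failed to parse string: 'Benchmark' as a scalar of type double

# 2) Scan the test split's TSV for the offending value to see which column it lives in.
#    The path below assumes the usual cv-corpus-<version>/<lang>/test.tsv layout.
tsv_path = r"D:\Senior\cv-corpus-15.0-2023-09-08\en\test.tsv"
df = pd.read_csv(tsv_path, sep="\t", dtype=str)
for col in df.columns:
    if (df[col] == "Benchmark").any():
        print("found 'Benchmark' in column:", col)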

python dataset huggingface huggingface-datasets

A: No answers yet