提问人:FOXASDF 提问时间:11/9/2023 更新时间:11/9/2023 访问量:125
DatasetGenerationError:尝试在本地加载 Common Voice 数据集时,生成数据集出错
DatasetGenerationError: An error occurred while generating the dataset when trying to load the Common Voice locally
问:
下载了整个 Common Voice 之后,我尝试加载数据集,但无法加载,我甚至用 pip 重新安装了 datasets 库。一旦进入数据集生成过程,它就会报错。
from datasets import load_dataset
test = load_dataset("D:\Senior\cv-corpus-15.0-2023-09-08",'en',split="test")
错误:
ArrowInvalid Traceback (most recent call last)
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:1940, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1933 writer = writer_class(
1934 features=writer._features,
1935 path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
(...)
1938 embed_local_files=embed_local_files,
1939 )
-> 1940 writer.write_table(table)
1941 num_examples_progress_update += len(table)
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\arrow_writer.py:572, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
571 pa_table = pa_table.combine_chunks()
--> 572 pa_table = table_cast(pa_table, self._schema)
573 if self.embed_local_files:
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:2328, in table_cast(table, schema)
2327 if table.schema != schema:
-> 2328 return cast_table_to_schema(table, schema)
2329 elif table.schema.metadata != schema.metadata:
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:2287, in cast_table_to_schema(table, schema)
2286 raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match")
-> 2287 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
2288 return pa.Table.from_arrays(arrays, schema=schema)
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:2287, in <listcomp>(.0)
2286 raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match")
-> 2287 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
2288 return pa.Table.from_arrays(arrays, schema=schema)
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:1831, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
1830 if isinstance(array, pa.ChunkedArray):
-> 1831 return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
1832 else:
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:1831, in <listcomp>(.0)
1830 if isinstance(array, pa.ChunkedArray):
-> 1831 return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
1832 else:
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:2143, in cast_array_to_feature(array, feature, allow_number_to_str)
2142 elif not isinstance(feature, (Sequence, dict, list, tuple)):
-> 2143 return array_cast(array, feature(), allow_number_to_str=allow_number_to_str)
2144 raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{feature}")
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:1833, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
1832 else:
-> 1833 return func(array, *args, **kwargs)
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\table.py:2027, in array_cast(array, pa_type, allow_number_to_str)
2026 raise TypeError(f"Couldn't cast array of type {array.type} to {pa_type}")
-> 2027 return array.cast(pa_type)
2028 raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{pa_type}")
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\array.pxi:935, in pyarrow.lib.Array.cast()
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\compute.py:400, in cast(arr, target_type, safe, options, memory_pool)
399 options = CastOptions.safe(target_type)
--> 400 return call_function("cast", [arr], options, memory_pool)
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\_compute.pyx:572, in pyarrow._compute.call_function()
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\_compute.pyx:367, in pyarrow._compute.Function.call()
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\error.pxi:100, in pyarrow.lib.check_status()
ArrowInvalid: Failed to parse string: 'Benchmark' as a scalar of type double
The above exception was the direct cause of the following exception:
DatasetGenerationError Traceback (most recent call last)
c:\Users\foxas\OneDrive\Desktop\Senior\Models testing\Whisper Tiny.ipynb Cell 8 line 4
1 from datasets import load_dataset
----> 4 test = load_dataset("D:\Senior\cv-corpus-15.0-2023-09-08",'en',split="test")
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py:2153, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
2150 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
2152 # Download and prepare data
-> 2153 builder_instance.download_and_prepare(
2154 download_config=download_config,
2155 download_mode=download_mode,
2156 verification_mode=verification_mode,
2157 try_from_hf_gcs=try_from_hf_gcs,
2158 num_proc=num_proc,
2159 storage_options=storage_options,
2160 )
2162 # Build dataset for splits
2163 keep_in_memory = (
2164 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
2165 )
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:954, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
952 if num_proc is not None:
953 prepare_split_kwargs["num_proc"] = num_proc
--> 954 self._download_and_prepare(
955 dl_manager=dl_manager,
956 verification_mode=verification_mode,
957 **prepare_split_kwargs,
958 **download_and_prepare_kwargs,
959 )
960 # Sync info
961 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:1049, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
1045 split_dict.add(split_generator.split_info)
1047 try:
1048 # Prepare split will record examples associated to the split
-> 1049 self._prepare_split(split_generator, **prepare_split_kwargs)
1050 except OSError as e:
1051 raise OSError(
1052 "Cannot find data file. "
1053 + (self.manual_download_instructions or "")
1054 + "\nOriginal error:\n"
1055 + str(e)
1056 ) from None
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:1813, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
1811 job_id = 0
1812 with pbar:
-> 1813 for job_id, done, content in self._prepare_split_single(
1814 gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
1815 ):
1816 if done:
1817 result = content
File c:\Users\foxas\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:1958, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1956 if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
1957 e = e.__context__
-> 1958 raise DatasetGenerationError("An error occurred while generating the dataset") from e
1960 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
DatasetGenerationError: An error occurred while generating the dataset
答: 暂无答案
评论