提问人:Daniel 提问时间:11/15/2023 最后编辑:Daniel 更新时间:11/16/2023 访问量:41
使用 Adobe PDF Extract API(Python)生成基于上下文的文本块
Context based chunks using Adobe PDF Extract Python
问:
最近我接触到了 Adobe PDF Extract API,我使用的是 Python。给不了解这个提取方法的人说明一下:给定一个 PDF,API 会以 JSON 对象的形式返回提取出的文本,其中每个段落/文本/列表都带有相应的样式,甚至还有坐标和路径。我想做的是根据生成的 JSON 文件构建基于上下文的文本块(chunk),但我希望能够检测出 PDF 中的所有标题和子标题,我该怎么做?
链接到完整的 JSON 文件:JSON 文件
编辑:
下面是使用 Adobe Extract API 从 PDF 中提取文本的代码:
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.client_config import ClientConfig
import logging
import shutil
import os.path
import zipfile
import json
# Script: send a local PDF through the Adobe PDF Extract operation, save the
# resulting ZIP archive, then print the text of every heading element found in
# the archive's structuredData.json.

# Path of the ZIP archive the Extract result is saved to.
zip_file = "./ExtractTextInfoFromPDF.zip"
# Remove the archive left over from a previous run before saving a new one
# (presumably save_as() will not overwrite an existing file - confirm).
if os.path.isfile(zip_file):
    os.remove(zip_file)

input_pdf = "./microbe-guide-rtc/AutoTagOutput-tagged.pdf"

# Structure tags treated as headings. The sample output tags the top-level
# heading as "//Document/H1"; H2-H6 are assumed to be used for sub-headings -
# TODO confirm against the full structuredData.json.
heading_tags = ("H1", "H2", "H3", "H4", "H5", "H6")

try:
    client_config: ClientConfig = (
        ClientConfig.builder()
        .with_connect_timeout(10000)
        .with_read_timeout(10000)
        .build()
    )
    # Client id / secret are intentionally blank here; fill in real
    # credentials before running.
    credentials = (
        Credentials.service_principal_credentials_builder()
        .with_client_id('')
        .with_client_secret('')
        .build()
    )
    execution_context = ExecutionContext.create(
        credentials, client_config=client_config)

    extract_pdf_operation = ExtractPDFOperation.create_new()
    source = FileRef.create_from_local_file(input_pdf)
    extract_pdf_operation.set_input(source)

    extract_pdf_options: ExtractPDFOptions = (
        ExtractPDFOptions.builder()
        .with_element_to_extract(ExtractElementType.TEXT)
        .with_get_char_info(True)
        .build()
    )
    extract_pdf_operation.set_options(extract_pdf_options)

    result: FileRef = extract_pdf_operation.execute(execution_context)
    result.save_as(zip_file)
except (ServiceApiException, ServiceUsageException, SdkException):
    logging.exception("Exception encountered while executing operation")
else:
    # Only reached when the extraction succeeded and the archive was saved.
    print("Successfully extracted information from PDF. Printing headings:\n")

    # Context managers make sure the archive and the entry are closed.
    with zipfile.ZipFile(zip_file, 'r') as archive:
        with archive.open('structuredData.json') as jsonentry:
            data = json.loads(jsonentry.read())

    for element in data["elements"]:
        # Take the last path segment ("//Document/H1" -> "H1") and drop an
        # optional "[n]" index suffix (assumed possible for repeated tags -
        # TODO confirm). The previous test used str.find(), which returns -1
        # (truthy) when the substring is absent, so it accepted almost every
        # element instead of filtering.
        tag = element.get("Path", "").rsplit("/", 1)[-1].split("[", 1)[0]
        if tag in heading_tags:
            print("=" * 50, "\n", element.get("Text", None))
它生成一个相当大的 json 文件,但下面是 JSON 的示例:
{
"version": {
"json_export": "192",
"page_segmentation": "5",
"schema": "1.1.0",
"structure": "1.1065.0",
"table_structure": "5"
},
"extended_metadata": {
"ID_instance": "2C D4 05 C8 70 B8 B2 11 0A 00 67 45 8B 6B C6 23 ",
"ID_permanent": "46 30 20 41 30 20 30 30 20 43 38 20 37 30 20 42 38 20 42 32 20 31 31 20 30 41 20 30 30 20 36 37 20 34 35 20 38 42 20 36 42 20 43 36 20 32 33 20 ",
"pdf_version": "1.6",
"pdfa_compliance_level": "",
"is_encrypted": false,
"has_acroform": false,
"is_digitally_signed": false,
"pdfua_compliance_level": "",
"page_count": 8,
"has_embedded_files": false,
"is_certified": false,
"is_XFA": false,
"language": "EN-US"
},
"elements": [
{
"Bounds": [
140.27999877929688,
692.2284545898438,
474.11688232421875,
720.9082183837891
],
"CharBounds": [
[
140.27999877929688,
705.6679840087891,
147.28524780273438,
720.9082183837891
],
[
147.29959106445312,
705.6679840087891,
154.4805908203125,
720.9082183837891
],
[
154.43995666503906,
705.6679840087891,
157.38265991210938,
720.9082183837891
],
[
157.38267517089844,
705.6679840087891,
164.31118774414062,
720.9082183837891
],
[
164.2825927734375,
705.6679840087891,
169.65196228027344,
720.9082183837891
],
[
169.68263244628906,
705.6679840087891,
172.1421356201172,
720.9082183837891
],
[
172.1421661376953,
705.6679840087891,
177.1930389404297,
720.9082183837891
],
[
177.12167358398438,
705.6679840087891,
184.55516052246094,
720.9082183837891
],
[
184.56179809570312,
705.6679840087891,
190.754638671875,
720.9082183837891
],
[
190.7545928955078,
705.6679840087891,
193.214111328125,
720.9082183837891
],
[
193.26243591308594,
705.6679840087891,
198.4671173095703,
720.9082183837891
],
[
198.48240661621094,
705.6679840087891,
205.6634063720703,
720.9082183837891
],
[
205.62168884277344,
705.6679840087891,
211.79257202148438,
720.9082183837891
],
[
211.80239868164062,
705.6679840087891,
221.40989685058594,
720.9082183837891
],
[
221.4022979736328,
705.6679840087891,
224.34500122070312,
720.9082183837891
],
[
224.3450164794922,
705.6679840087891,
229.79112243652344,
720.9082183837891
],
[
229.9251251220703,
705.6679840087891,
235.37123107910156,
720.9082183837891
],
[
235.38555908203125,
705.6679840087891,
238.32826232910156,
720.9082183837891
],
[
238.32827758789062,
705.6679840087891,
245.5751190185547,
720.9082183837891
],
[
245.5883331298828,
705.6679840087891,
252.5935821533203,
720.9082183837891
],
[
252.60902404785156,
705.6679840087891,
255.0685272216797,
720.9082183837891
],
[
255.0685577392578,
705.6679840087891,
260.9209442138672,
720.9082183837891
],
[
260.8891296386719,
705.6679840087891,
266.2584991455078,
720.9082183837891
],
[
266.22877502441406,
705.6679840087891,
272.42161560058594,
720.9082183837891
],
[
272.4084014892578,
705.6679840087891,
282.0158996582031,
720.9082183837891
],
[
282.00828552246094,
705.6679840087891,
284.95098876953125,
720.9082183837891
],
[
284.9510040283203,
705.6679840087891,
290.39710998535156,
720.9082183837891
],
[
290.4707336425781,
705.6679840087891,
292.93023681640625,
720.9082183837891
],
[
292.9302673339844,
705.6679840087891,
299.59523010253906,
720.9082183837891
],
[
299.58970642089844,
705.6679840087891,
305.4421081542969,
720.9082183837891
],
[
305.4695739746094,
705.6679840087891,
311.3219757080078,
720.9082183837891
],
[
311.2901611328125,
705.6679840087891,
315.94580078125,
720.9082183837891
],
[
315.96990966796875,
705.6679840087891,
318.91261291503906,
720.9082183837891
],
[
318.9126281738281,
705.6679840087891,
324.732177734375,
720.9082183837891
],
[
324.7321014404297,
705.6679840087891,
331.39707946777344,
720.9082183837891
],
[
330.5515899658203,
705.6679840087891,
335.9976806640625,
720.9082183837891
],
[
336.0120086669922,
705.6679840087891,
338.9547119140625,
720.9082183837891
],
[
338.95472717285156,
705.6679840087891,
346.3882141113281,
720.9082183837891
],
[
346.3882751464844,
705.6679840087891,
353.63511657714844,
720.9082183837891
],
[
353.64833068847656,
705.6679840087891,
358.8529968261719,
720.9082183837891
],
[
358.8090057373047,
705.6679840087891,
361.2685089111328,
720.9082183837891
],
[
361.26853942871094,
705.6679840087891,
366.3194122314453,
720.9082183837891
],
[
366.2491455078125,
705.6679840087891,
373.6826171875,
720.9082183837891
],
[
373.68927001953125,
705.6679840087891,
379.8821105957031,
720.9082183837891
],
[
379.86888122558594,
705.6679840087891,
382.3283996582031,
720.9082183837891
],
[
382.3284149169922,
705.6679840087891,
391.9359130859375,
720.9082183837891
],
[
391.9283142089844,
705.6679840087891,
394.8710174560547,
720.9082183837891
],
[
394.87103271484375,
705.6679840087891,
400.6905822753906,
720.9082183837891
],
[
400.6905059814453,
705.6679840087891,
406.8833465576172,
720.9082183837891
],
[
406.7504425048828,
705.6679840087891,
414.1839294433594,
720.9082183837891
],
[
414.19056701660156,
705.6679840087891,
421.6240539550781,
720.9082183837891
],
[
421.6306915283203,
705.6679840087891,
427.8235321044922,
720.9082183837891
],
[
427.7510223388672,
705.6679840087891,
434.7562713623047,
720.9082183837891
],
[
434.8310089111328,
705.6679840087891,
441.4959716796875,
720.9082183837891
],
[
441.49046325683594,
705.6679840087891,
448.7373046875,
720.9082183837891
],
[
448.7505187988281,
705.6679840087891,
451.6932067871094,
720.9082183837891
],
[
451.69322204589844,
705.6679840087891,
456.8979034423828,
720.9082183837891
],
[
456.8528137207031,
705.6679840087891,
466.46031188964844,
720.9082183837891
],
[
466.45269775390625,
705.6679840087891,
471.6573791503906,
720.9082183837891
],
[
471.6573028564453,
705.6679840087891,
474.11680603027344,
720.9082183837891
],
[
154.80653381347656,
692.2284545898438,
161.73504638671875,
707.4687042236328
],
[
161.70645141601562,
692.2284545898438,
167.07582092285156,
707.4687042236328
],
[
167.0460968017578,
692.2284545898438,
173.54637145996094,
707.4687042236328
],
[
173.5869598388672,
692.2284545898438,
178.95632934570312,
707.4687042236328
],
[
178.92662048339844,
692.2284545898438,
183.58226013183594,
707.4687042236328
],
[
183.36700439453125,
692.2284545898438,
190.8004913330078,
707.4687042236328
],
[
190.86642456054688,
692.2284545898438,
196.71881103515625,
707.4687042236328
],
[
196.68699645996094,
692.2284545898438,
202.05636596679688,
707.4687042236328
],
[
202.0859375,
692.2284545898438,
209.0144500732422,
707.4687042236328
],
[
209.014404296875,
692.2284545898438,
211.47390747070312,
707.4687042236328
],
[
211.50686645507812,
692.2284545898438,
218.6878662109375,
707.4687042236328
],
[
218.64614868164062,
692.2284545898438,
223.850830078125,
707.4687042236328
],
[
223.80682373046875,
692.2284545898438,
226.74952697753906,
707.4687042236328
],
[
226.74954223632812,
692.2284545898438,
233.9963836669922,
707.4687042236328
],
[
234.0095977783203,
692.2284545898438,
241.0148468017578,
707.4687042236328
],
[
241.03028869628906,
692.2284545898438,
243.4897918701172,
707.4687042236328
],
[
243.4898223876953,
692.2284545898438,
250.4950714111328,
707.4687042236328
],
[
250.50941467285156,
692.2284545898438,
255.8787841796875,
707.4687042236328
],
[
255.85015869140625,
692.2284545898438,
263.0970001220703,
707.4687042236328
],
[
263.1695098876953,
692.2284545898438,
268.53887939453125,
707.4687042236328
],
[
268.5695495605469,
692.2284545898438,
274.0156555175781,
707.4687042236328
],
[
274.0299835205078,
692.2284545898438,
276.97267150878906,
707.4687042236328
],
[
276.9727020263672,
692.2284545898438,
282.79225158691406,
707.4687042236328
],
[
282.79217529296875,
692.2284545898438,
285.2516784667969,
707.4687042236328
],
[
285.251708984375,
692.2284545898438,
290.62107849121094,
707.4687042236328
],
[
290.5924530029297,
692.2284545898438,
297.83929443359375,
707.4687042236328
],
[
297.8514099121094,
692.2284545898438,
304.8566589355469,
707.4687042236328
],
[
304.8721008300781,
692.2284545898438,
307.81480407714844,
707.4687042236328
],
[
307.8148193359375,
692.2284545898438,
315.06166076660156,
707.4687042236328
],
[
315.0748748779297,
692.2284545898438,
320.4442443847656,
707.4687042236328
],
[
320.4156188964844,
692.2284545898438,
325.7849884033203,
707.4687042236328
],
[
325.81565856933594,
692.2284545898438,
332.0084991455078,
707.4687042236328
],
[
331.9952850341797,
692.2284545898438,
334.93798828125,
707.4687042236328
],
[
334.93800354003906,
692.2284545898438,
342.1848449707031,
707.4687042236328
],
[
342.19915771484375,
692.2284545898438,
349.20440673828125,
707.4687042236328
],
[
349.21875,
692.2284545898438,
351.6782531738281,
707.4687042236328
],
[
351.67828369140625,
692.2284545898438,
358.8592834472656,
707.4687042236328
],
[
358.8186492919922,
692.2284545898438,
366.06549072265625,
707.4687042236328
],
[
366.0787048339844,
692.2284545898438,
373.00721740722656,
707.4687042236328
],
[
373.03900146484375,
692.2284545898438,
378.4083709716797,
707.4687042236328
],
[
378.378662109375,
692.2284545898438,
384.5714874267578,
707.4687042236328
],
[
384.5714569091797,
692.2284545898438,
387.0309600830078,
707.4687042236328
],
[
387.0782012939453,
692.2284545898438,
392.6561584472656,
707.4687042236328
],
[
392.65611267089844,
692.2284545898438,
395.1156311035156,
707.4687042236328
],
[
395.1156463623047,
692.2284545898438,
400.9352111816406,
707.4687042236328
],
[
400.9351348876953,
692.2284545898438,
405.9860076904297,
707.4687042236328
],
[
405.9750213623047,
692.2284545898438,
412.16786193847656,
707.4687042236328
],
[
412.15464782714844,
692.2284545898438,
414.61415100097656,
707.4687042236328
],
[
414.6141815185547,
692.2284545898438,
420.46656799316406,
707.4687042236328
],
[
419.71446228027344,
692.2284545898438,
426.3794403076172,
707.4687042236328
],
[
426.43431091308594,
692.2284545898438,
432.62713623046875,
707.4687042236328
],
[
432.49424743652344,
692.2284545898438,
437.9403533935547,
707.4687042236328
],
[
437.9535827636719,
692.2284545898438,
440.4130859375,
707.4687042236328
],
[
440.4131164550781,
692.2284545898438,
445.99107360839844,
707.4687042236328
],
[
446.0536193847656,
692.2284545898438,
451.63157653808594,
707.4687042236328
],
[
451.63153076171875,
692.2284545898438,
457.20948791503906,
707.4687042236328
],
[
457.20945739746094,
692.2284545898438,
459.66896057128906,
707.4687042236328
]
],
"Font": {
"alt_family_name": "Calibri",
"embedded": true,
"encoding": "WinAnsiEncoding",
"family_name": "Calibri",
"font_type": "TrueType",
"italic": false,
"monospaced": false,
"name": "XWYBVT+Calibri-Bold",
"subset": true,
"weight": 700
},
"HasClip": false,
"Lang": "en",
"Page": 0,
"Path": "//Document/H1",
"Text": "GUIDE FOR SUBMITTING PERMIT APPLICATIONS FOR MICROORGANISMS DEVELOPED USING GENETIC ENGINEERING UNDER 7 CFR PART 340 ",
"TextSize": 10.979995727539062,
"attributes": {
"LineHeight": 13.5,
"SpaceAfter": 13.75,
"TextAlign": "Center"
}
}
]
}
每个提取出的文本元素都与该对象的结构相同,只是坐标和字符边界不同
答: 暂无答案
评论