提问人:Mytho 提问时间:11/2/2023 最后编辑:Mytho 更新时间:11/7/2023 访问量:65
如何查找具有给定颜色的 PDF 文本并替换它
How to Find PDF Text with a Given Color and Replace it
问:
我想查找具有给定颜色的文本,然后将其替换为新颜色。我知道 Acrobat 可以做到这一点,但是在进行低级编辑时会破坏标签树。我想使用 iText 来自动执行此任务。
我的应用程序主要在标签树中工作,用于与可访问性相关的操作。但是,我不认为颜色信息位于那里。我已经遍历了一堆标签,但我没有看到任何颜色。此外,PAC 和 iText Rups 都不会在标签树中显示颜色信息。
我假设我必须转到 PDF 的“较低级别”,但我不确定该怎么做。
我下载了 PDF 标准的副本,发现颜色信息存储在标识符“rg”之后。iText Rups 在内容流中显示以下内容:
EMC
/Standard <</MCID 0 >> BDC
q
0 0 0 rg --------------------- Sets color to black for this word
BT
56.8 724.1 Td
/F1 12 Tf
<0102030405> Tj
ET
Q
EMC
如何在 iText 中访问此流?或者,有没有办法在不达到这个级别的情况下做到这一点?如果可以的话,我更愿意使用 PdfStructs 或标签树中的某些东西。
编辑:在查看了链接的评论后,我想出了这个:
/// <summary>
/// Console entry point that opens <c>input.pdf</c> together with a writer for
/// <c>output.pdf</c> and runs <see cref="MyProcessor"/> over every page so that
/// the text and fill-color components of each text chunk are printed.
/// </summary>
class Program
{
    /// <summary>
    /// Runs the color inspection over all pages of the input document.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        string inputPdfPath = "input.pdf";
        string outputPdfPath = "output.pdf";

        // set up PDF
        PdfReader reader = new(inputPdfPath);
        PdfWriter writer = new(outputPdfPath);
        PdfDocument pdfDocument = new PdfDocument(reader, writer);
        try
        {
            pdfDocument.SetTagged();

            // Search each page with the processor (page numbers are 1-based)
            MyProcessor editor = new MyProcessor(new Listener());
            int pageCount = pdfDocument.GetNumberOfPages();
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
            {
                editor.EditPage(pdfDocument, pageNumber);
            }
        }
        finally
        {
            // Close even when a page fails to parse, so the document is not left open.
            pdfDocument.Close();
        }
    }
}
/// <summary>
/// Canvas processor that prints, for every text-rendering event, the rendered
/// text followed by the components of its fill color.
/// </summary>
public class MyProcessor : PdfCanvasProcessor
{
    /// <summary>
    /// Creates the processor on top of the given event listener.
    /// </summary>
    /// <param name="eventListener">Listener handed to the base processor.</param>
    public MyProcessor(IEventListener eventListener) : base(eventListener) {}

    /// <summary>
    /// Logs text and fill color for RENDER_TEXT events, then forwards every
    /// event to the base implementation.
    /// </summary>
    /// <param name="data">Event payload; may be null.</param>
    /// <param name="type">Kind of event being reported.</param>
    protected override void EventOccurred(IEventData data, EventType type)
    {
        // Only inspect text. The pattern match also rejects null payloads.
        if (type is EventType.RENDER_TEXT && data is TextRenderInfo txt)
        {
            Console.WriteLine(txt.GetText());
            foreach (float component in txt.GetFillColor().GetColorValue())
            {
                Console.Write(component + " ");
            }
            Console.WriteLine("");
        }

        base.EventOccurred(data, type);
    }

    /// <summary>
    /// Parses the content of one page so that <see cref="EventOccurred"/> is
    /// raised for what it contains. Despite the name, nothing is written back
    /// to the document.
    /// </summary>
    /// <param name="pdfDocument">Document that owns the page.</param>
    /// <param name="pageNumber">Number of the page to process.</param>
    public void EditPage(PdfDocument pdfDocument, int pageNumber)
    {
        PdfPage page = pdfDocument.GetPage(pageNumber);
        ProcessContent(page.GetContentBytes(), page.GetResources());
    }
}
/// <summary>
/// Event listener that ignores every event; it only declares which event
/// types it supports.
/// </summary>
public class Listener : IEventListener
{
    /// <summary>
    /// Intentionally does nothing.
    /// </summary>
    /// <param name="data">Event payload (ignored).</param>
    /// <param name="type">Event type (ignored).</param>
    public void EventOccurred(IEventData data, EventType type)
    {
    }

    /// <summary>
    /// Returns the event types this listener declares as supported.
    /// </summary>
    /// <returns>A new collection of text, image and path event types.</returns>
    public ICollection<EventType> GetSupportedEvents()
    {
        Collection<EventType> supported = new Collection<EventType>();
        supported.Add(EventType.BEGIN_TEXT);
        supported.Add(EventType.RENDER_TEXT);
        supported.Add(EventType.END_TEXT);
        supported.Add(EventType.RENDER_IMAGE);
        supported.Add(EventType.RENDER_PATH);
        supported.Add(EventType.CLIP_PATH_CHANGED);
        return supported;
    }
}
我现在可以找到特定的颜色。我仍然需要弄清楚如何修改现有颜色。不过,看起来我必须重写页面内容,而不是修改现有文档。
答:
1赞
Mytho
11/7/2023
#1
在参考了链接中的 Java 示例后,我设法让它工作了。下面的代码接受一个目标颜色和一个替换颜色。它会搜索整个文档,并将目标颜色的所有实例替换为替换颜色。
这应该适用于所有色彩空间。我使用 RGB 作为搜索条件,但如果需要,您可以将其更改为 DeviceCMYK 或 DeviceGray。
using System.Collections.ObjectModel;
using iText.Kernel.Colors;
using iText.Kernel.Exceptions;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
/// <summary>
/// Console entry point that rewrites every page of <c>input.pdf</c> into
/// <c>output.pdf</c>, replacing one text fill color with another.
/// </summary>
class Program
{
    /// <summary>
    /// Opens the document, runs a <see cref="PdfCanvasEditor"/> over each page
    /// and saves the result.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // Replace the paths with your own file
        string inputPdfPath = "input.pdf";
        string outputPdfPath = "output.pdf";

        // set up PDF
        PdfReader reader = new(inputPdfPath);
        PdfWriter writer = new(outputPdfPath);
        PdfDocument pdfDocument = new PdfDocument(reader, writer);
        try
        {
            pdfDocument.SetTagged();

            // Use your own values here!
            Color find = new DeviceRgb(229, 18, 18);
            Color replacement = new DeviceRgb(0, 0, 0);
            // alternatively: new DeviceCmyk, new DeviceGray

            // Create a new editor
            PdfCanvasEditor editor = new(find, replacement);

            // Replace every page (page numbers are 1-based)
            int pageCount = pdfDocument.GetNumberOfPages();
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
            {
                editor.EditPage(pdfDocument, pageNumber);
            }
        }
        finally
        {
            // Save changes to the output file. Runs even when editing throws,
            // so the document is not left open.
            pdfDocument.Close();
        }
    }
}
/// <summary>
/// A <see cref="PdfCanvasProcessor"/> that re-emits a page's content stream
/// instruction by instruction into a new stream. Around text-showing operators
/// whose fill color equals the color to find, it inserts an instruction that
/// sets the replacement color, and it puts the original color back before the
/// next instruction that does not show text.
/// </summary>
public class PdfCanvasEditor : PdfCanvasProcessor
{
    /// <summary>
    /// Content-stream operators that paint text.
    /// </summary>
    private static readonly HashSet<string> TextShowingOperators = new() { "Tj", "'", "\"", "TJ" };

    /// <summary>
    /// The original fill color that is currently overridden in the output,
    /// or null while no override is active.
    /// </summary>
    private Color currentColor;

    /// <summary>
    /// Color specified by the user to be replaced
    /// </summary>
    private readonly Color colorToFind;

    /// <summary>
    /// Color written in place of <see cref="colorToFind"/>
    /// </summary>
    private readonly Color replacementColor;

    /// <summary>
    /// Holds output canvas and related resources; only set while
    /// <see cref="EditContent"/> is running.
    /// </summary>
    protected PdfCanvas canvas = null;

    /// <summary>
    /// Creates an editor that swaps <paramref name="find"/> for <paramref name="replace"/>.
    /// </summary>
    /// <param name="find">Fill color to look for on text.</param>
    /// <param name="replace">Fill color to paint matching text with.</param>
    /// <exception cref="ArgumentNullException">Either color is null.</exception>
    public PdfCanvasEditor(iText.Kernel.Colors.Color find, iText.Kernel.Colors.Color replace) : base(new ContentListener())
    {
        this.colorToFind = find ?? throw new ArgumentNullException(nameof(find));
        this.replacementColor = replace ?? throw new ArgumentNullException(nameof(replace));
    }

    /// <summary>
    /// Edits a page by a given number: its content is re-emitted into a fresh
    /// stream which then replaces the page's /Contents entry.
    /// </summary>
    /// <param name="pdfDocument">Document opened with both a reader and a writer.</param>
    /// <param name="pageNumber">Number of the page to edit.</param>
    /// <exception cref="PdfException">The document has no reader or no writer.</exception>
    public void EditPage(PdfDocument pdfDocument, int pageNumber)
    {
        if ((pdfDocument.GetReader() == null) || (pdfDocument.GetWriter() == null))
        {
            throw new PdfException("PdfDocument must be editable");
        }

        // Get the current page and resources
        PdfPage page = pdfDocument.GetPage(pageNumber);
        PdfResources pdfResources = page.GetResources();

        // Create a new canvas to make changes on
        PdfCanvas pdfCanvas = new PdfCanvas(new PdfStream(), pdfResources, pdfDocument);

        // Make the changes
        EditContent(page.GetContentBytes(), pdfResources, pdfCanvas);

        // Overwrites the old page with the new data
        page.Put(PdfName.Contents, pdfCanvas.GetContentStream());
    }

    /// <summary>
    /// Processes raw content-stream bytes and writes the edited instructions
    /// to <paramref name="canvas"/>.
    /// </summary>
    /// <param name="contentBytes">Content stream to parse.</param>
    /// <param name="resources">Resources the content stream refers to.</param>
    /// <param name="canvas">Canvas whose content stream receives the output.</param>
    public void EditContent(byte[] contentBytes, PdfResources resources, PdfCanvas canvas)
    {
        this.canvas = canvas;
        // Do not let an override left pending by a previous stream leak into this one.
        currentColor = null;
        try
        {
            ProcessContent(contentBytes, resources);
        }
        finally
        {
            this.canvas = null;
        }
    }

    /// <summary>
    /// Writes one instruction to the output content stream: the elements of
    /// <paramref name="operands"/> separated by spaces and followed by a new line.
    /// </summary>
    /// <param name="processor">Processor that parsed the instruction (not used here).</param>
    /// <param name="pdfOperator">
    /// Operator of the instruction. It is not written separately; the operator is
    /// expected to be the last element of <paramref name="operands"/>.
    /// </param>
    /// <param name="operands">Objects to serialize, in order.</param>
    public void Write(PdfCanvasProcessor processor, PdfLiteral pdfOperator, IList<PdfObject> operands)
    {
        PdfOutputStream pdfOutputStream = canvas.GetContentStream().GetOutputStream();
        int lastIndex = operands.Count - 1;

        for (int i = 0; i <= lastIndex; i++)
        {
            pdfOutputStream.Write(operands[i]);
            if (i < lastIndex)
            {
                pdfOutputStream.WriteSpace();
            }
            else
            {
                pdfOutputStream.WriteNewLine();
            }
        }
    }

    /// <summary>
    /// Writes an instruction, inserting color instructions around it when needed:
    /// the replacement color before the first text-showing operator painted in the
    /// color to find, and the original color again before the next instruction
    /// that is not text-showing.
    /// </summary>
    /// <param name="processor">Processor that parsed the instruction.</param>
    /// <param name="pdfLiteral">Operator of the instruction.</param>
    /// <param name="operands">Operands of the instruction, operator included.</param>
    /// <param name="isColor">Not used; it only distinguishes this overload.</param>
    public void Write(PdfCanvasProcessor processor, PdfLiteral pdfLiteral, IList<PdfObject> operands, bool isColor)
    {
        // NOTE(review): this prints once per content-stream operator; consider removing.
        Console.WriteLine("Processing PDF data");
        string operatorString = pdfLiteral.ToString();

        if (TextShowingOperators.Contains(operatorString))
        {
            // Only start an override when none is active yet.
            if (currentColor == null)
            {
                Color currentFillColor = GetGraphicsState().GetFillColor();
                if (colorToFind.Equals(currentFillColor))
                {
                    // Remember the original so it can be restored afterwards
                    currentColor = currentFillColor;
                    WriteColor(processor, replacementColor);
                }
            }
        }
        else if (currentColor != null)
        {
            // Restore the ORIGINAL fill color so that content other than text
            // (paths, fills, ...) keeps its appearance.
            WriteColor(processor, currentColor);

            // Reset. Allows for more colors to be replaced
            currentColor = null;
        }

        Write(processor, pdfLiteral, operands);
    }

    /// <summary>
    /// Wraps every registered operator so that each invocation is also forwarded
    /// to this editor.
    /// </summary>
    /// <param name="operatorString">Operator the handler is registered for.</param>
    /// <param name="pdfOperator">Handler to register.</param>
    /// <returns>
    /// The handler previously registered for the operator, unwrapped if it was
    /// one of this editor's wrappers.
    /// </returns>
    public override IContentOperator RegisterContentOperator(string operatorString, IContentOperator pdfOperator)
    {
        ContentOperatorWrapper wrapper = new(this) { OriginalOperator = pdfOperator };
        IContentOperator formerOperator = base.RegisterContentOperator(operatorString, wrapper);

        return formerOperator is ContentOperatorWrapper formerWrapper
            ? formerWrapper.OriginalOperator
            : formerOperator;
    }

    /// <summary>
    /// Writes the instruction that selects <paramref name="color"/> as fill color.
    /// </summary>
    private void WriteColor(PdfCanvasProcessor processor, Color color)
    {
        List<PdfObject> instruction = GetColorList(color);
        // GetColorList puts the operator literal last.
        Write(processor, (PdfLiteral)instruction[instruction.Count - 1], instruction);
    }

    /// <summary>
    /// Builds the operand list (components followed by the operator literal) that
    /// sets <paramref name="color"/> as fill color: "k" for DeviceCmyk, "g" for
    /// DeviceGray, "rg" otherwise.
    /// </summary>
    private List<PdfObject> GetColorList(Color color)
    {
        float[] values = color.GetColorValue();

        // NOTE(review): any Color type other than DeviceCmyk/DeviceGray is written
        // as "rg" using its first three components - confirm callers only pass
        // device colors.
        (int componentCount, string colorOperator) = color switch
        {
            DeviceCmyk => (4, "k"),
            DeviceGray => (1, "g"),
            _ => (3, "rg"),
        };

        List<PdfObject> list = new(componentCount + 1);
        for (int i = 0; i < componentCount; i++)
        {
            list.Add(new PdfNumber(values[i]));
        }
        list.Add(new PdfLiteral(colorOperator));
        return list;
    }

    /// <summary>
    /// A content operator class to wrap all content operators to forward the invocation to the editor
    /// </summary>
    class ContentOperatorWrapper : IContentOperator
    {
        private readonly PdfCanvasEditor editor;

        public ContentOperatorWrapper(PdfCanvasEditor editor)
        {
            this.editor = editor;
        }

        /// <summary>
        /// The wrapped handler; may be null.
        /// </summary>
        public IContentOperator OriginalOperator { get; set; }

        /// <summary>
        /// Runs the wrapped handler (except for "Do") and then hands the
        /// instruction to the editor for output.
        /// </summary>
        public void Invoke(PdfCanvasProcessor processor, PdfLiteral pdfLiteral, IList<PdfObject> operands)
        {
            // "Do" is not forwarded to the wrapped handler but is still copied to the
            // output. Presumably this keeps XObject content from being processed into
            // the page stream - TODO confirm.
            if (OriginalOperator != null && !"Do".Equals(pdfLiteral.ToString()))
            {
                OriginalOperator.Invoke(processor, pdfLiteral, operands);
            }
            editor.Write(processor, pdfLiteral, operands, true);
        }
    }

    /// <summary>
    /// A listener that ignores every event; it only declares the supported event types.
    /// </summary>
    class ContentListener : IEventListener
    {
        public void EventOccurred(IEventData data, EventType type) { }

        ICollection<EventType> IEventListener.GetSupportedEvents()
        {
            return new Collection<EventType>() {
                EventType.BEGIN_TEXT, EventType.RENDER_TEXT, EventType.END_TEXT, EventType.RENDER_IMAGE,
                EventType.RENDER_PATH, EventType.CLIP_PATH_CHANGED
            };
        }
    }
}
评论