如何查找具有给定颜色的 PDF 文本并替换它

How to Find PDF Text with a Given Color and Replace it

提问人:Mytho 提问时间:11/2/2023 最后编辑:Mytho 更新时间:11/7/2023 访问量:65

问:

我想查找具有给定颜色的文本,然后将其替换为新颜色。我知道 Acrobat 可以做到这一点,但是在进行低级编辑时会破坏标签树。我想使用 iText 来自动执行此任务。

我的应用程序主要在标签树中工作,用于与可访问性相关的操作。但是,我不认为颜色信息位于那里。我已经遍历了一堆标签,但我没有看到任何颜色。此外,PAC 和 iText Rups 都不会在标签树中显示颜色信息。

我假设我必须转到 PDF 的“较低级别”,但我不确定该怎么做。

我下载了 PDF 标准的副本,发现颜色信息存储在标识符“rg”之后。iText Rups 在内容流中显示以下内容:


EMC
/Standard <</MCID 0 >> BDC
q
0 0 0 rg --------------------- Sets color to black for this word
BT
56.8 724.1 Td
/F1 12 Tf
<0102030405> Tj
ET
Q
EMC

如何在 iText 中访问此流?或者,有没有办法不必深入到这一层就能做到这一点?如果可以的话,我更愿意使用 PdfStructs 或标签树中的某些东西。

编辑:在查看了链接的评论后,我想出了这个:


    /// <summary>
    /// Console entry point: opens <c>input.pdf</c> together with a writer for <c>output.pdf</c>
    /// and runs the inspecting processor over every page.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Feeds each page of the input document to a <c>MyProcessor</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string inputPdfPath = "input.pdf";
            string outputPdfPath = "output.pdf";

            // set up PDF
            PdfReader reader = new(inputPdfPath);
            PdfWriter writer = new(outputPdfPath);

            // The using declaration closes the document when Main exits, including when
            // a page throws part-way through, so the underlying files are always released.
            using PdfDocument pdfDocument = new PdfDocument(reader, writer);
            pdfDocument.SetTagged();

            // Search each page with the processor (iText page numbers are 1-based)
            MyProcessor editor = new MyProcessor(new Listener());
            int pageCount = pdfDocument.GetNumberOfPages();
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
            {
                editor.EditPage(pdfDocument, pageNumber);
            }
        }
    }
    /// <summary>
    /// Canvas processor that prints every rendered text chunk together with the
    /// component values of its fill color. It only inspects content; nothing is modified.
    /// </summary>
    public class MyProcessor : PdfCanvasProcessor
    {
        /// <summary>
        /// Creates the processor on top of the given event listener.
        /// </summary>
        /// <param name="eventListener">Listener handed to the base processor.</param>
        public MyProcessor(IEventListener eventListener) : base(eventListener) {}

        /// <summary>
        /// Prints the text and fill-color components of text-render events, then forwards
        /// the event to the base implementation.
        /// </summary>
        /// <param name="data">Event payload; may be null.</param>
        /// <param name="type">Kind of event being reported.</param>
        protected override void EventOccurred(IEventData data, EventType type)
        {
            // The type pattern also rejects null, so no separate null check is needed.
            if (type is EventType.RENDER_TEXT && data is TextRenderInfo txt)
            {
                Console.WriteLine(txt.GetText());
                foreach (float component in txt.GetFillColor().GetColorValue())
                {
                    Console.Write(component + " ");
                }
                Console.WriteLine("");
            }

            base.EventOccurred(data, type);
        }

        /// <summary>
        /// Runs the processor over the content of one page.
        /// </summary>
        /// <param name="pdfDocument">Document that owns the page.</param>
        /// <param name="pageNumber">Page number passed to <c>GetPage</c>.</param>
        public void EditPage(PdfDocument pdfDocument, int pageNumber)
        {
            PdfPage page = pdfDocument.GetPage(pageNumber);
            PdfResources pdfResources = page.GetResources();

            // Only the existing content is parsed; no output canvas is needed for inspection.
            ProcessContent(page.GetContentBytes(), pdfResources);
        }
    }
    
    /// <summary>
    /// Event listener that does nothing with the events it receives; it only declares
    /// which event types it is interested in.
    /// </summary>
    public class Listener : IEventListener
    {
        /// <summary>
        /// Intentionally empty: events are ignored.
        /// </summary>
        public void EventOccurred(IEventData data, EventType type)
        {
        }

        /// <summary>
        /// Returns the event types this listener declares as supported.
        /// </summary>
        public ICollection<EventType> GetSupportedEvents()
        {
            Collection<EventType> supported = new Collection<EventType>();

            // Text events
            supported.Add(EventType.BEGIN_TEXT);
            supported.Add(EventType.RENDER_TEXT);
            supported.Add(EventType.END_TEXT);

            // Image and path events
            supported.Add(EventType.RENDER_IMAGE);
            supported.Add(EventType.RENDER_PATH);
            supported.Add(EventType.CLIP_PATH_CHANGED);

            return supported;
        }
    }

我现在可以找到特定的颜色。我仍然需要弄清楚如何修改现有颜色。不过,看起来我必须重写页面内容,而不是修改现有文档。

C# PDF itext7

评论

0赞 Mytho 11/2/2023
为了简洁起见,我只展示了一个摘录,流中还有更多内容。我看了更多,我发现了你所说的其他颜色变化。
1赞 mkl 11/2/2023
您可能想看看这个答案,它显示了如何使用 iText 7 编辑此类流;特别要考虑标题“文本颜色更改”下的示例用法。该答案包含 iText 7 for Java 的代码,因此您可能必须先将代码移植到 C#。

答:

1赞 Mytho 11/7/2023 #1

在参考了链接中的 Java 示例之后,我设法让它工作了。下面的代码接受一个目标颜色和一个替换颜色;它会搜索整个文档,并把所有使用目标颜色的文本改为替换颜色。

这应该适用于所有色彩空间。我使用 RGB 作为搜索条件,但如果需要,您可以将其更改为 DeviceCMYK 或 DeviceGray。

using System.Collections.ObjectModel;
using iText.Kernel.Colors;
using iText.Kernel.Exceptions;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Listener;

/// <summary>
/// Console entry point: recolors text in <c>input.pdf</c> and writes the result to <c>output.pdf</c>.
/// </summary>
class Program
{
    /// <summary>
    /// Opens the document, runs a <c>PdfCanvasEditor</c> over every page and saves the result.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        // Replace the paths with your own file
        string inputPdfPath = "input.pdf";
        string outputPdfPath = "output.pdf";

        // set up PDF
        PdfReader reader = new(inputPdfPath);
        PdfWriter writer = new(outputPdfPath);

        // The using declaration closes the document when Main exits. That saves the
        // changes on success and still releases both files if a page throws.
        using PdfDocument pdfDocument = new PdfDocument(reader, writer);
        pdfDocument.SetTagged();

        // Use your own values here!
        Color find = new DeviceRgb(229, 18, 18);
        Color replacement = new DeviceRgb(0, 0, 0);
        // alternatively: new DeviceCmyk, new DeviceGray

        // Create a new editor
        PdfCanvasEditor editor = new(find, replacement);

        // Edit every page (iText page numbers are 1-based)
        int pageCount = pdfDocument.GetNumberOfPages();
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            editor.EditPage(pdfDocument, pageNumber);
        }
    }
}

/// <summary>
/// Content-stream editor that copies the operators of a page into a fresh stream and,
/// whenever text is shown while the fill color equals the color to find, injects a
/// fill-color operator so the text is painted in the replacement color. The original
/// fill color is written back before the next non-text-showing operator, so only text
/// is recolored.
/// </summary>
/// <remarks>
/// The match uses <c>Color.Equals</c>.
/// NOTE(review): that comparison is presumably exact (same color class and identical
/// float components), so a near-match in the PDF would not be replaced — confirm.
/// </remarks>
public class PdfCanvasEditor : PdfCanvasProcessor
{
    /// <summary>
    /// Original fill color of the text run that is currently being recolored,
    /// or null when no replacement is in effect.
    /// </summary>
    private Color currentColor;

    /// <summary>
    /// Color specified by the user to be replaced
    /// </summary>
    private readonly Color colorToFind;

    /// <summary>
    /// Color written in place of <see cref="colorToFind"/>
    /// </summary>
    private readonly Color replacementColor;

    /// <summary>
    /// Operators that show text; the fill color is checked only when one of these is seen.
    /// </summary>
    private static readonly List<string> TEXT_SHOWING_OPERATORS = new() { "Tj", "'", "\"", "TJ" };

    /// <summary>
    /// Output canvas that receives the rewritten content; set only while
    /// <see cref="EditContent"/> is running.
    /// </summary>
    protected PdfCanvas canvas = null;

    /// <summary>
    /// Creates an editor that repaints text drawn in <paramref name="find"/> with <paramref name="replace"/>.
    /// </summary>
    /// <param name="find">Fill color to look for.</param>
    /// <param name="replace">Fill color to write instead.</param>
    /// <exception cref="ArgumentNullException">Either color is null.</exception>
    public PdfCanvasEditor(iText.Kernel.Colors.Color find, iText.Kernel.Colors.Color replace) : base(new ContentListener()) {
        this.colorToFind = find ?? throw new ArgumentNullException(nameof(find));
        this.replacementColor = replace ?? throw new ArgumentNullException(nameof(replace));
    }


    /// <summary>
    /// Rewrites the content of one page and stores the result as the page's new content stream.
    /// </summary>
    /// <param name="pdfDocument">Document opened with both a reader and a writer.</param>
    /// <param name="pageNumber">Page number passed to <c>GetPage</c>.</param>
    /// <exception cref="PdfException">The document has no reader or no writer.</exception>
    public void EditPage(PdfDocument pdfDocument, int pageNumber)
    {
        if ((pdfDocument.GetReader() == null) || (pdfDocument.GetWriter() == null))
        {
            throw new PdfException("PdfDocument must be editable");
        }

        // Get the current page and resources
        PdfPage page = pdfDocument.GetPage(pageNumber);
        PdfResources pdfResources = page.GetResources();

        // Create a new canvas to collect the rewritten operators
        PdfCanvas pdfCanvas = new PdfCanvas(new PdfStream(), pdfResources, pdfDocument);
        EditContent(page.GetContentBytes(), pdfResources, pdfCanvas);

        // Overwrites the old page content with the new stream
        page.Put(PdfName.Contents, pdfCanvas.GetContentStream());
    }

    /// <summary>
    /// Processes raw content bytes and writes the edited operators to <paramref name="canvas"/>.
    /// </summary>
    /// <param name="contentBytes">Content stream bytes to process.</param>
    /// <param name="resources">Resources used while processing the content.</param>
    /// <param name="canvas">Canvas whose content stream receives the output.</param>
    public void EditContent(byte[] contentBytes, PdfResources resources, PdfCanvas canvas)
    {
        // Start every stream with a clean slate so that a replacement left pending by
        // a previous stream cannot leak into this one.
        currentColor = null;
        this.canvas = canvas;
        try
        {
            ProcessContent(contentBytes, resources);
        }
        finally
        {
            this.canvas = null;
        }
    }

    /// <summary>
    /// Writes one instruction to the output canvas: every element of
    /// <paramref name="operands"/> separated by spaces and terminated by a new line.
    /// </summary>
    /// <remarks>
    /// Only <paramref name="operands"/> is written, so the list must already contain the
    /// operator as its last element (as the lists built by <c>GetColorList</c> do).
    /// </remarks>
    /// <param name="processor">Processor that parsed the instruction; not used here.</param>
    /// <param name="pdfOperator">Operator of the instruction; not written separately.</param>
    /// <param name="operands">Objects to write, in order.</param>
    public void Write(PdfCanvasProcessor processor, PdfLiteral pdfOperator, IList<PdfObject> operands)
    {
        PdfOutputStream pdfOutputStream = canvas.GetContentStream().GetOutputStream();
        int lastIndex = operands.Count - 1;

        for (int i = 0; i <= lastIndex; i++)
        {
            pdfOutputStream.Write(operands[i]);
            if (i < lastIndex) {
                pdfOutputStream.WriteSpace();
            } else {
                pdfOutputStream.WriteNewLine();
            }
        }
    }

    /// <summary>
    /// Copies one instruction to the output, injecting fill-color changes around text
    /// that is drawn in the color to find.
    /// </summary>
    /// <param name="processor">Processor that parsed the instruction.</param>
    /// <param name="pdfLiteral">Operator of the instruction.</param>
    /// <param name="operands">Operands of the instruction.</param>
    /// <param name="isColor">Not used; kept for compatibility with existing callers.</param>
    public void Write(PdfCanvasProcessor processor, PdfLiteral pdfLiteral, IList<PdfObject> operands, bool isColor)
    {
        Console.WriteLine("Processing PDF data");
        string operatorString = pdfLiteral.ToString();

        if (TEXT_SHOWING_OPERATORS.Contains(operatorString))
        {
            // Consecutive text-showing operators share one injected color change.
            if (currentColor == null)
            {
                Color currentFillColor = GetGraphicsState().GetFillColor();
                if (colorToFind.Equals(currentFillColor))
                {
                    // Remember the original color so it can be restored after the text
                    currentColor = currentFillColor;
                    WriteFillColor(processor, replacementColor);
                }
            }
        }
        else if (currentColor != null)
        {
            // Restore the ORIGINAL fill color (in its own color space) before any other
            // operator runs, so that only the text is repainted.
            WriteFillColor(processor, currentColor);

            // Reset. Allows for more colors to be replaced
            currentColor = null;
        }

        Write(processor, pdfLiteral, operands);
    }

    /// <summary>
    /// Wraps every registered operator so that each invocation is also forwarded to this editor.
    /// </summary>
    /// <param name="operatorString">Operator being registered.</param>
    /// <param name="pdfOperator">Handler to wrap.</param>
    /// <returns>The previously registered handler, unwrapped if it was one of our wrappers.</returns>
    public override IContentOperator RegisterContentOperator(string operatorString, IContentOperator pdfOperator)
    {
        ContentOperatorWrapper wrapper = new ContentOperatorWrapper(this);
        wrapper.OriginalOperator = pdfOperator;

        IContentOperator formerOperator = base.RegisterContentOperator(operatorString, wrapper);
        return formerOperator is ContentOperatorWrapper formerWrapper
            ? formerWrapper.OriginalOperator
            : formerOperator;
    }

    /// <summary>
    /// Emits the instruction that sets the fill color to <paramref name="color"/>.
    /// </summary>
    private void WriteFillColor(PdfCanvasProcessor processor, Color color)
    {
        List<PdfObject> colorInstruction = GetColorList(color);
        // GetColorList always appends the operator literal as the final element.
        PdfLiteral colorOperator = (PdfLiteral)colorInstruction[colorInstruction.Count - 1];
        Write(processor, colorOperator, colorInstruction);
    }

    /// <summary>
    /// Builds the fill-color instruction for a color: its component values followed by the
    /// operator literal ("k" for DeviceCmyk, "g" for DeviceGray, "rg" for everything else).
    /// </summary>
    /// <param name="color">Color to encode. Any type other than DeviceCmyk or DeviceGray is
    /// treated as having three components.</param>
    private List<PdfObject> GetColorList(Color color)
    {
        float[] values = color.GetColorValue();

        int componentCount;
        string fillOperator;
        switch (color)
        {
            case DeviceCmyk _:
                componentCount = 4;
                fillOperator = "k";
                break;
            case DeviceGray _:
                componentCount = 1;
                fillOperator = "g";
                break;
            default:
                componentCount = 3;
                fillOperator = "rg";
                break;
        }

        List<PdfObject> list = new();
        for (int i = 0; i < componentCount; i++)
        {
            list.Add(new PdfNumber(values[i]));
        }
        list.Add(new PdfLiteral(fillOperator));

        return list;
    }

    /// <summary>
    /// A content operator class to wrap all content operators to forward the invocation to the editor
    /// </summary>
    class ContentOperatorWrapper : IContentOperator
    {
        private readonly PdfCanvasEditor editor;

        public ContentOperatorWrapper(PdfCanvasEditor editor)
        {
            this.editor = editor;
        }

        /// <summary>
        /// The handler that was registered for the operator before wrapping; may be null.
        /// </summary>
        public IContentOperator OriginalOperator { get; set; }

        /// <summary>
        /// Lets the wrapped handler process the instruction (except for "Do"), then hands
        /// the instruction to the editor to be written out.
        /// </summary>
        public void Invoke(PdfCanvasProcessor processor, PdfLiteral pdfLiteral, IList<PdfObject> operands)
        {
            // "Do" is copied to the output but not executed by the wrapped handler.
            // NOTE(review): presumably this keeps the processor from descending into
            // XObjects, whose content would otherwise be written into the page stream — confirm.
            if (OriginalOperator != null && !"Do".Equals(pdfLiteral.ToString()))
            {
                OriginalOperator.Invoke(processor, pdfLiteral, operands);
            }
            editor.Write(processor, pdfLiteral, operands, true);
        }
    }

    /// <summary>
    /// Listener passed to the base processor; it ignores every event and only declares
    /// the event types it supports.
    /// </summary>
    class ContentListener : IEventListener
    {
        public void EventOccurred(IEventData data, EventType type) { }

        ICollection<EventType> IEventListener.GetSupportedEvents()
        {
            return new Collection<EventType>() {
                EventType.BEGIN_TEXT, EventType.RENDER_TEXT, EventType.END_TEXT, EventType.RENDER_IMAGE,
                EventType.RENDER_PATH, EventType.CLIP_PATH_CHANGED
            };
        }
    }
}