2017-01-13 11 views
0

PDF ファイル内でマーク(ハイライト)したテキストを取得したいと考えています。そのために PdfPage の PdfAnnotation を反復処理しています。注釈には getRectangle() というメソッドがあり、これは PdfArray を返します。しかし、この PdfArray から、注釈でマークされたテキストの位置に重なる Rectangle のインスタンスを作成することができません。強調表示されたコンテンツを抽出するために、注釈(PdfTextMarkupAnnotation)から矩形を作成するにはどうすればよいですか?

注釈の Rectangle があれば、LocationTextExtractionStrategy を使ってマークされたコンテンツをフィルタリングできます。

Marked text in PDF document

iText でそれを取得するために、次のコードを書きました:

PDF からマーク(ハイライト)されたテキストのみを抽出するために、マークされた領域に一致する矩形を作成する方法
package biz.hochguertel; 

import com.itextpdf.kernel.color.DeviceCmyk; 
import com.itextpdf.kernel.events.Event; 
import com.itextpdf.kernel.events.IEventHandler; 
import com.itextpdf.kernel.events.PdfDocumentEvent; 
import com.itextpdf.kernel.font.PdfFont; 
import com.itextpdf.kernel.geom.Rectangle; 
import com.itextpdf.kernel.pdf.*; 
import com.itextpdf.kernel.pdf.annot.PdfAnnotation; 
import com.itextpdf.kernel.pdf.annot.PdfTextMarkupAnnotation; 
import com.itextpdf.kernel.pdf.canvas.PdfCanvas; 
import com.itextpdf.kernel.pdf.canvas.parser.EventType; 
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor; 
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData; 
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo; 
import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter; 
import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredEventListener; 
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy; 

import java.io.File; 
import java.io.IOException; 
import java.util.List; 

/**
 * Demo from the question: copies every page of a source PDF into a new document and, for each
 * text-markup annotation that has contents, tries to extract the text under the annotation.
 *
 * <p>NOTE(review): this listing is kept exactly as asked in the question. It contains the
 * rectangle construction problem the question is about (see the note inside {@code process()}).
 */
public class AppIText { 

    // Source PDF resolved from the classpath; getResource(...) returns null (and this line throws
    // a NullPointerException) when the resource is not present.
    private String filePath = getClass().getClassLoader().getResource("itext/OCA/549_OCA_Java_SE_7_Programmer_I_Certification.pdf").getFile(); 
    // Output file that receives the copied pages (and the debug rectangles drawn on them).
    private static String DEST = "demo-output/549_OCA_Java_SE_7_Programmer_I_Certification.pdf"; 
    // Document opened for reading (source of pages and annotations).
    private PdfDocument pdfDocument; 
    // Document opened for writing (destination of the copied pages).
    private PdfDocument pdfWriteDoc; 

    /**
     * Prepares the output location and opens the source and destination documents.
     *
     * @throws IOException declared for the reader/writer constructors used here
     */
    public void before() throws IOException { 
     File file = new File(DEST); 
     // mkdir() creates a single directory level only.
     file.getParentFile().mkdir(); 
     if (file.exists()) { 
      file.delete(); 
     } 
     pdfDocument = new PdfDocument(new PdfReader(filePath)); 
     pdfWriteDoc = new PdfDocument(new PdfWriter(DEST)); 
    } 

    /**
     * Entry point: open the documents, process all pages, then close both documents.
     * close() is only reached when before() and process() complete without throwing.
     */
    public static void main(String[] args) throws IOException { 
     AppIText appIText = new AppIText(); 
     appIText.before(); 
     appIText.process(); 
     appIText.close(); 
    } 

    /** Closes the source document first, then the destination document. */
    private void close() { 
     pdfDocument.close(); 
     pdfWriteDoc.close(); 
    } 

    /**
     * Walks all pages (1-based), copies each page to the destination document and inspects the
     * annotations of the source page.
     */
    private void process() { 
     for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) { 
      PdfPage page = pdfDocument.getPage(i); 

      // Copy exactly page i; the first element of the returned list is used as the target page.
      List<PdfPage> newPdfPages = pdfDocument.copyPagesTo(i, i, pdfWriteDoc); 
      PdfPage newPage = null; 
      if (newPdfPages.size() > 0) { 
       newPage = newPdfPages.get(0); 
      } 

      List<PdfAnnotation> annotations = page.getAnnotations(); 
      for (PdfAnnotation annotation : annotations) { 
       // Only annotations that carry contents are considered.
       if (annotation.getContents() != null) { 
        System.out.println(annotation.getContents()); 
        if (annotation instanceof PdfTextMarkupAnnotation) { 
         PdfArray rectangleArray = annotation.getRectangle(); 
         double x = ((PdfNumber) rectangleArray.get(0)).getValue(); 
         double y = ((PdfNumber) rectangleArray.get(1)).getValue(); 
         double xWidth = ((PdfNumber) rectangleArray.get(2)).getValue(); 
         double yWidth = ((PdfNumber) rectangleArray.get(3)).getValue(); 
         System.out.println(String.format("x=%s,y=%s,w=%s,h=%s", x, y, xWidth, yWidth)); 
         // NOTE(review): array elements 2 and 3 are passed on unchanged as the third and fourth
         // Rectangle arguments. A PDF /Rect array holds two corner points, so these are
         // presumably the upper-right x/y rather than a width/height - this is the problem the
         // question is about.
         Rectangle rectangle = new Rectangle((float) x, (float) y, (float) xWidth, (float) yWidth); 

         // Visual check: paint the computed rectangle onto the copied page.
         // newPage is null here if copyPagesTo returned an empty list.
         PdfCanvas canvas = new PdfCanvas(newPage); 
         canvas.setFillColor(new DeviceCmyk(1, 0, 0, 0)) 
           .rectangle(rectangle) 
           .fillStroke() 
         ; 

         // Run the page content through a location-based extraction strategy behind FontFilter.
         FontFilter fontFilter = new FontFilter(rectangle); 
         FilteredEventListener listener = new FilteredEventListener(); 
         LocationTextExtractionStrategy extractionStrategy = listener.attachEventListener(new LocationTextExtractionStrategy(), fontFilter); 
         new PdfCanvasProcessor(listener).processPageContent(page); 
         // The extracted text is assigned but not used or printed afterwards.
         String actualText = extractionStrategy.getResultantText(); 
        } 
       } 
      } 
     } 
    } 

} 

/**
 * Document event handler that paints a fixed, narrow filled rectangle on the page the event
 * refers to.
 */
class RectangleEventHandler implements IEventHandler {

    /**
     * Draws the rectangle (x=20, y=10, width=10, height=820) in CMYK cyan on the event's page,
     * appending to the page's last content stream.
     *
     * @param event the event to handle; it is cast to {@link PdfDocumentEvent}
     */
    @Override
    public void handleEvent(Event event) {
        PdfDocumentEvent documentEvent = (PdfDocumentEvent) event;
        PdfDocument document = documentEvent.getDocument();
        PdfPage targetPage = documentEvent.getPage();

        PdfCanvas painter = new PdfCanvas(
                targetPage.getLastContentStream(),
                targetPage.getResources(),
                document);

        // Same three drawing operations as a fluent chain, issued one statement at a time.
        painter.setFillColor(new DeviceCmyk(1, 0, 0, 0));
        painter.rectangle(new Rectangle(20, 10, 10, 820));
        painter.fillStroke();
    }
}

/**
 * Event filter that accepts text-render events whose font name ends with "Bold" or "Oblique".
 *
 * <p>Although this class extends {@link TextRegionEventFilter} and forwards a rectangle to it,
 * the overridden {@link #accept(IEventData, EventType)} does not call the superclass
 * implementation, so the decision made here depends on the font name only.
 */
class FontFilter extends TextRegionEventFilter {

    /**
     * @param filterRect rectangle handed to the {@link TextRegionEventFilter} constructor
     */
    public FontFilter(Rectangle filterRect) {
        super(filterRect);
    }

    /**
     * @param data event payload; cast to {@link TextRenderInfo} for RENDER_TEXT events
     * @param type kind of the parser event
     * @return {@code true} only for RENDER_TEXT events with a non-null font whose name ends with
     *         "Bold" or "Oblique"; {@code false} otherwise
     */
    @Override
    public boolean accept(IEventData data, EventType type) {
        if (!type.equals(EventType.RENDER_TEXT)) {
            return false;
        }

        PdfFont textFont = ((TextRenderInfo) data).getFont();
        if (textFont == null) {
            return false;
        }

        String name = textFont.getFontProgram().getFontNames().getFontName();
        if (name.endsWith("Bold")) {
            return true;
        }
        return name.endsWith("Oblique");
    }
}
  • PDF からマークされたテキストだけを抽出するために、マークされた領域に一致する矩形を作成するにはどうすればよいですか?
  • または、注釈付きのテキストを PDF から取得する別の方法はありますか?

上記のコードで重要なのは、次の部分です。

Rectangle rectangle = new Rectangle((float) x, (float) y, (float) xWidth, (float) yWidth);

答えて

0

私は解決策を見つけました。

Rectangle の計算(getRectangle() が返す配列 rect は [左下 x, 左下 y, 右上 x, 右上 y]):
X:rect[0]
Y:rect[1]
幅:rect[2] - rect[0]
高さ:rect[3] - rect[1]

現在得られている結果:

ビジュアルデバッグ(LOG_LEVEL >= 100 の場合):

enter image description here

抽出したコンテンツ:

13:50:01.323 [main] INFO b.h.AppIText - Annotation contents: q(7.1).explain(1) 
13:50:01.323 [main] INFO b.h.AppIText - rectangleArray: x=90.0338, y=438.245, w=468.33, h=489.749 
13:50:01.323 [main] INFO b.h.AppIText - pageSizeWithRotation: x=0.0, y=0.0, w=531.0, h=666.0, top=666.0, bottom=0.0, left=0.0, right=531.0 
13:50:01.337 [main] INFO b.h.AppIText - str: Purpose: A finally block can’t be placed before the catch blocks. <cut here because the book is not free, but I get the complete marked text..> 

修正後のコードは次のとおりです:

package biz.hochguertel; 

import com.itextpdf.kernel.color.DeviceCmyk; 
import com.itextpdf.kernel.geom.Rectangle; 
import com.itextpdf.kernel.pdf.*; 
import com.itextpdf.kernel.pdf.annot.PdfAnnotation; 
import com.itextpdf.kernel.pdf.annot.PdfTextMarkupAnnotation; 
import com.itextpdf.kernel.pdf.canvas.PdfCanvas; 
import com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor; 
import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter; 
import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredTextEventListener; 
import com.itextpdf.kernel.pdf.canvas.parser.listener.ITextExtractionStrategy; 
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy; 
import org.slf4j.Logger; 
import org.slf4j.LoggerFactory; 

import java.io.File; 
import java.io.IOException; 
import java.util.List; 


/**
 * Extracts the text lying underneath text-markup (highlight) annotations of a PDF with iText 7.
 *
 * <p>Every page of the source document is copied to an output document. For each annotation
 * that has contents and is a {@link PdfTextMarkupAnnotation}, the annotation's {@code /Rect}
 * array is converted into a {@link Rectangle} and used as a region filter for a
 * {@link LocationTextExtractionStrategy}. With visual debugging enabled
 * ({@code LOG_LEVEL >= VISUAL_DEBUG}) the rectangle is additionally painted onto the copied page.
 *
 * <p>With the help of the following documentations:
 * - http://developers.itextpdf.com/content/best-itext-questions-stackoverview/content-parsing-extraction-and-redaction-text/itext7-how-read-text-specific-position
 */
public class AppIText {

    private static final Logger LOGGER = LoggerFactory.getLogger(AppIText.class);
    // Deliberately not final so visual debugging can be switched on while experimenting.
    private static int LOG_LEVEL = 0;
    private static final int VISUAL_DEBUG = 100;

    private static final String SOURCE_RESOURCE = "itext/OCA/393-394,549-550_OCA_Java_SE_7_Programmer_I_Certification.pdf";
    private static String DEST = "demo-output/393-394,549-550_OCA_Java_SE_7_Programmer_I_Certification.pdf";

    private String filePath;
    private PdfDocument pdfDocument;
    private PdfDocument pdfWriteDoc;

    /**
     * Resolves the source PDF, prepares the output directory and opens both documents.
     *
     * @throws IOException if the source resource is missing or a document cannot be opened
     */
    public void before() throws IOException {
        filePath = resolveSourcePath();

        File file = new File(DEST);
        File parent = file.getParentFile();
        if (parent != null) {
            // mkdirs() also copes with nested output directories.
            parent.mkdirs();
        }
        if (file.exists()) {
            file.delete();
        }

        pdfDocument = new PdfDocument(new PdfReader(filePath));
        boolean writerOpened = false;
        try {
            pdfWriteDoc = new PdfDocument(new PdfWriter(DEST));
            writerOpened = true;
        } finally {
            if (!writerOpened) {
                // Do not leak the already opened reader when the writer cannot be created.
                pdfDocument.close();
                pdfDocument = null;
            }
        }
    }

    public static void main(String[] args) throws IOException {
        AppIText appIText = new AppIText();
        appIText.before();
        appIText.process();
    }

    /**
     * Copies all pages to the output document, extracts the marked text of every text-markup
     * annotation that has contents, and finally closes both documents - also when processing
     * fails part-way.
     */
    private void process() {
        try {
            for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) {
                PdfPage page = pdfDocument.getPage(i);

                List<PdfPage> newPdfPages = pdfDocument.copyPagesTo(i, i, pdfWriteDoc);
                PdfPage newPage = newPdfPages.isEmpty() ? null : newPdfPages.get(0);

                for (PdfAnnotation annotation : page.getAnnotations()) {
                    if (annotation.getContents() == null) {
                        continue;
                    }
                    System.out.println();
                    LOGGER.info("Annotation contents: {}", annotation.getContents());
                    if (annotation instanceof PdfTextMarkupAnnotation) {
                        extractMarkedText(page, newPage, annotation);
                    }
                }
            }
        } finally {
            closeDocuments();
        }
    }

    /** Logs the geometry of one markup annotation and the page text found inside its rectangle. */
    private void extractMarkedText(PdfPage page, PdfPage newPage, PdfAnnotation annotation) {
        PdfArray rectangleArray = annotation.getRectangle();
        if (rectangleArray == null || rectangleArray.size() < 4) {
            LOGGER.warn("Skipping annotation without a usable rectangle: {}", annotation.getContents());
            return;
        }
        // The "w"/"h" labels are historical: elements 2 and 3 are the second corner's x and y.
        LOGGER.info("rectangleArray: x={}, y={}, w={}, h={}",
                rectangleArray.get(0),
                rectangleArray.get(1),
                rectangleArray.get(2),
                rectangleArray.get(3)
        );
        Rectangle pageSizeWithRotation = page.getCropBox();
        LOGGER.info("pageSizeWithRotation: x={}, y={}, w={}, h={}, top={}, bottom={}, left={}, right={}",
                pageSizeWithRotation.getX(),
                pageSizeWithRotation.getY(),
                pageSizeWithRotation.getWidth(),
                pageSizeWithRotation.getHeight(),
                pageSizeWithRotation.getTop(),
                pageSizeWithRotation.getBottom(),
                pageSizeWithRotation.getLeft(),
                pageSizeWithRotation.getRight()
        );

        // NOTE(review): /Rect is the annotation's bounding box. For a highlight spanning several
        // lines it may cover more than the marked words; /QuadPoints would be more precise.
        Rectangle rectangle = toRectangle(rectangleArray);

        // newPage is null only if the page copy produced nothing; then there is nothing to draw on.
        if (LOG_LEVEL >= VISUAL_DEBUG && newPage != null) {
            PdfCanvas canvas = new PdfCanvas(newPage);
            canvas.setFillColor(new DeviceCmyk(1, 0, 0, 0))
                    .rectangle(rectangle)
                    .fillStroke();
        }

        TextRegionEventFilter regionFilter = new TextRegionEventFilter(rectangle);
        ITextExtractionStrategy strategy = new FilteredTextEventListener(new LocationTextExtractionStrategy(), regionFilter);
        String str = PdfTextExtractor.getTextFromPage(page, strategy) + "\n";
        LOGGER.info("str: {}", str);
    }

    /**
     * Converts a PDF rectangle array (two diagonally opposite corners) into an iText
     * {@link Rectangle} (x, y, width, height).
     *
     * <p>Example: [90.0338, 438.245, 468.33, 489.749] becomes x=90.0338, y=438.245,
     * width=468.33-90.0338, height=489.749-438.245. The corners are normalized so that an array
     * listing them in another order still yields a non-negative width and height.
     */
    private static Rectangle toRectangle(PdfArray rectangleArray) {
        float x1 = ((PdfNumber) rectangleArray.get(0)).floatValue();
        float y1 = ((PdfNumber) rectangleArray.get(1)).floatValue();
        float x2 = ((PdfNumber) rectangleArray.get(2)).floatValue();
        float y2 = ((PdfNumber) rectangleArray.get(3)).floatValue();
        return new Rectangle(
                Math.min(x1, x2),
                Math.min(y1, y2),
                Math.abs(x2 - x1),
                Math.abs(y2 - y1)
        );
    }

    /** Looks up the source PDF on the classpath and fails with a clear message if it is absent. */
    private static String resolveSourcePath() throws IOException {
        java.net.URL resource = AppIText.class.getClassLoader().getResource(SOURCE_RESOURCE);
        if (resource == null) {
            throw new IOException("Source PDF not found on classpath: " + SOURCE_RESOURCE);
        }
        return resource.getFile();
    }

    /** Closes the source document, then the output document; the second close always runs. */
    private void closeDocuments() {
        try {
            if (pdfDocument != null) {
                pdfDocument.close();
            }
        } finally {
            if (pdfWriteDoc != null) {
                pdfWriteDoc.close();
            }
        }
    }

}
関連する問題