2016-04-27 33 views
3

Python の PDFMiner で PDF を CSV に変換する

私は Python の初心者で、PDF ファイルを CSV ファイルに変換できるようにしたいと考えています。便利そうなスクリプトをいくつか見つけましたが、疑問があります。入力する PDF と、出力先の CSV ファイルはどこで指定すればよいのでしょうか。

私は Python 2.7.11 と PDFMiner 20140328 を使用しています。スクリプトは以下のとおりです:

import sys 
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.pdfpage import PDFPage 
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter 
from pdfminer.layout import LAParams 
from cStringIO import StringIO 

def pdfparser(data):
    """Extract the text of a PDF file and print it to standard output.

    Args:
        data: Path of the PDF file to read.

    The extracted text is only printed, never written to a file; redirect
    standard output (``python script.py input.pdf > output.csv``) to keep it.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # open() instead of the Python 2-only file() builtin; the with-block
    # guarantees the handle is released even if a page fails to parse.
    with open(data, 'rb') as fp:
        try:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        finally:
            device.close()

    # Read the buffer once, after every page has been rendered into it.
    text = retstr.getvalue()
    print(text)

if __name__ == '__main__':
    # The PDF path is taken from the first command-line argument.
    if len(sys.argv) < 2:
        sys.exit('usage: %s input.pdf' % sys.argv[0])
    pdfparser(sys.argv[1])
+0

1. 入力はコマンドラインで指定します。 2. このコードはファイルには書き込まず、単に*出力*するだけです:`print data`。おそらく `python yourScriptName.py input.pdf > output.csv` のようにしたいのだと思います。ただし*まず*、いくつかあるインデントエラーを修正するか、ソースを正しくコピーできているかを確認したほうがよいでしょう。 – usr2564301

答えて

5

以下は、SO の tgray による回答(this)のコードをいくつか変更したものです:

def pdf_to_csv(filename, separator, threshold):
    """Convert the text of a PDF file into separator-delimited lines.

    Characters are grouped into rows by the integer part of their ``y``
    coordinate, and ``separator`` is inserted wherever the horizontal gap
    between two neighbouring characters is large compared with the row's
    average gap.

    Args:
        filename: Path of the PDF file to read.
        separator: String inserted between detected columns.
        threshold: Multiplier applied to a row's average character distance;
            a gap larger than ``average * threshold`` starts a new column.

    Returns:
        The converted text, with the rows of each page wrapped in
        ``START PAGE n`` / ``END PAGE n`` marker lines.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams, LTChar
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        """TextConverter that writes one delimited line per row of characters."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            """Write the characters collected for the current page, row by row."""
            # Maps a row key to {x: encoded character} for that row.
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    # Only the last two bbox values are used
                    # (presumably x1 and y1 -- confirm against pdfminer docs).
                    (_, _, x, y) = child.bbox
                    # y is negated so the ascending sort below emits rows
                    # with a larger y first.
                    lines[int(-y)][x] = child._text.encode(self.codec)
            for y in sorted(lines):
                self.outfp.write(self.line_creator(lines[y]))
                self.outfp.write("\n")

        def line_creator(self, line):
            """Join one row's characters, adding separators at wide gaps.

            Args:
                line: Dict mapping an x coordinate to the character there.

            Returns:
                The row as a single string.
            """
            keys = sorted(line)
            if not keys:
                return ''
            gaps = [right - left for left, right in zip(keys, keys[1:])]
            # NOTE: the sum of the gaps is divided by the number of
            # characters (not the number of gaps), as in the original code.
            average_distance = sum(gaps) / len(keys)
            result = [line[keys[0]]]
            for key, gap in zip(keys[1:], gaps):
                # A gap wider than average * threshold marks a new column.
                if gap > average_distance * self.threshold:
                    result.append(self.separator)
                result.append(line[key])
            return ''.join(result)

    # The following part is a remix of the convert() function in the
    # pdfminer/tools/pdf2text module.
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc, device)

    with open(filename, 'rb') as fp:
        try:
            for i, page in enumerate(PDFPage.get_pages(fp)):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            # Release the device even when a page fails to parse.
            device.close()

    return outfp.getvalue()


if __name__ == '__main__':
    # Delimiter placed between the columns of the generated CSV.
    separator = ';'
    # Gap multiplier beyond which a character is treated as the start of a
    # new word/column/block; 1.5 usually works quite well.
    threshold = 1.5
    print(pdf_to_csv(filename='myLovelyFile.pdf',
                     separator=separator,
                     threshold=threshold))

リンク先の回答とこのコードの主な違いは line_creator メソッドで、PDF からある程度の構造を抽出しようとします。 `pdfparser(sys.argv[1])`:

PDFMiner 20140328 で動作するはずです。