Quantcast
Channel: Recent Gists from 84adam
Viewing all articles
Browse latest Browse all 35

Extract text from PDF files

$
0
0
pdf_text.py
import os
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
def convert_pdf_to_txt(path, pages=None):
    """Extract the text of a PDF file and return it as a single string.

    Each selected page is run through pdfminer's ``PDFPageInterpreter``
    with a ``TextConverter`` that writes into an in-memory buffer; the
    buffer's contents are returned once all pages are processed.

    Args:
        path: Filesystem path of the PDF; opened in binary mode.
        pages: Optional iterable of page numbers forwarded (as a set) to
            ``PDFPage.get_pages``. When omitted or empty, an empty set is
            forwarded instead.
            NOTE(review): presumably pdfminer treats an empty set as
            "every page" and the numbers as zero-based -- confirm against
            the installed pdfminer version.

    Returns:
        The text accumulated by the converter.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()

    # The buffer, the converter and the input file are all released even if
    # pdfminer raises part-way through a malformed document.
    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the text before the ``with`` block closes the buffer.
        return output.getvalue()
if __name__ == '__main__':
    # Script entry point: prompt for a PDF path on stdin, then print the
    # text extracted from that file to stdout.
    pdf_path = input("Enter name of PDF file from which to extract text: ")
    print(convert_pdf_to_txt(pdf_path))

Viewing all articles
Browse latest Browse all 35

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>