Quantcast
Viewing latest article 8
Browse Latest Browse All 34

Extract text from a PDF given its URL

pdf_url_text.py
from io import BytesIO, StringIO

import requests
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
def convert_pdf_to_txt(url, pages=None):
    """Download the PDF at *url* and return its extracted text.

    Args:
        url: Address of the PDF document; fetched with ``requests.get``.
        pages: Optional iterable of page numbers to extract. It is turned
            into a set and handed to ``PDFPage.get_pages``. A falsy value
            (``None`` or an empty collection) passes an empty set.
            NOTE(review): pdfminer is expected to treat an empty set as
            "all pages" and to number pages from zero -- confirm against
            the installed pdfminer version.

    Returns:
        The text written by pdfminer's ``TextConverter`` for the
        processed pages, as a single string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For other failures raised by
            ``requests.get``.
    """
    pagenums = set(pages) if pages else set()

    response = requests.get(url)
    # Fail here on an error status instead of handing an HTML error page
    # to the PDF parser, which would produce a misleading parse failure.
    response.raise_for_status()

    manager = PDFResourceManager()
    # The in-memory buffers are closed by the ``with`` block even if page
    # processing raises part-way through.
    with BytesIO(response.content) as infile, StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        finally:
            # Close the converter before reading the text back, as the
            # original code did.
            converter.close()
        return output.getvalue()
if __name__ == '__main__':
    # Script entry point: prompt for a PDF URL and print the extracted text.
    # Whitespace picked up when pasting the URL is stripped before use.
    url = input("Enter URL of PDF from which to extract text: ").strip()
    # Example URL: https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf
    # Output:
    # >>> Dummy PDF file
    output = convert_pdf_to_txt(url)
    print(output)

Viewing latest article 8
Browse Latest Browse All 34

Trending Articles