#!python
# Extract text from academic pdfs.
import argparse
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io
from io import BytesIO

# Command-line interface: two required positionals and one optional switch.
parser = argparse.ArgumentParser(description='Converts academic paper to textfile.')
parser.add_argument('infile', nargs=1,  help="Input file")
parser.add_argument('outfile', nargs=1,  help="Output file")
# The option is only ever tested for truthiness, so it should be usable as a
# bare switch.  nargs='?' with const=True allows a plain `-p` while still
# accepting the older `-p VALUE` spelling.  Because an optional value is
# consumed greedily, put a bare `-p` after the positional arguments.
parser.add_argument('-p', '--separate-pages', nargs='?', const=True,
                    help="Request per-page handling (value optional)")
args = parser.parse_args()



def convert_pdf_to_txt(path):
    """Extract the text of a PDF file and strip short and copyright lines.

    Args:
        path: Path of the PDF file to read.  A one-element list or tuple
            (the shape argparse produces for ``nargs=1``) is also accepted,
            in which case its first element is used.

    Returns:
        The extracted text as a single string, with every line shorter than
        10 characters and every line containing '©' removed; the remaining
        lines are joined with newlines.
    """
    # Started with stuff from here: https://stackoverflow.com/questions/39854841/pdfminer-python-3-5
    if isinstance(path, (list, tuple)):
        path = path[0]

    rsrcmgr = PDFResourceManager()
    with io.StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as inf:
                # An empty page set and maxpages=0 are passed so no page
                # restriction is requested.
                for page in PDFPage.get_pages(inf, set(), maxpages=0,
                                              password="",
                                              caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Release the converter even when a page fails to parse.
            device.close()
        text = retstr.getvalue()

    # Presumably this drops page numbers, stray fragments and copyright
    # footers -- the thresholds are heuristics, not guarantees.
    return "\n".join(
        line for line in text.split('\n')
        if len(line) >= 10 and '©' not in line
    )


# `infile` and `outfile` are declared with nargs=1, so each is a one-element
# list; pass the actual path string rather than the list.
out = convert_pdf_to_txt(args.infile[0])
# Write as UTF-8 explicitly: extracted text routinely contains non-ASCII
# characters that a platform-default encoding may be unable to represent.
with open(args.outfile[0], 'w', encoding='utf-8') as f:
    f.write(out)
