lxml
nltk

[pdf]
layoutparser[layoutmodels,tesseract]
