import os

# Workaround: pin an older PyTorch, since detectron2 has not released packages for PyTorch 1.9
# (issue: https://github.com/facebookresearch/detectron2/issues/3158)
# os.system('pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html')
os.system('pip install -q torch==1.10.0+cu111 torchvision==0.11.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html')

# Install detectron2 from source so that it matches the PyTorch version installed above
# (the prebuilt cu101/torch1.8 wheel below is kept for reference).
# See https://detectron2.readthedocs.io/tutorials/install.html for instructions
# os.system('pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html')
os.system('pip install git+https://github.com/facebookresearch/detectron2.git')

import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

import gradio as gr
import re
import string
from operator import itemgetter
import collections

import pypdf
from pypdf import PdfReader
from pypdf.errors import PdfReadError

import pypdfium2 as pdfium
import langdetect
from langdetect import detect_langs

import pandas as pd
import numpy as np
import random
import tempfile
import itertools

from matplotlib import font_manager
from PIL import Image, ImageDraw, ImageFont
import cv2

## files
import sys
sys.path.insert(0, 'files/')
import functions
from functions import *

# update pip
os.system('python -m pip install --upgrade pip')

## model / feature extractor / tokenizer

model_id_lilt = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
model_id1 = model_id_lilt

model_id_layoutxlm = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
model_id2 = model_id_layoutxlm

# tokenizer for LayoutXLM
tokenizer_id_layoutxlm = "xlm-roberta-base"

# get device
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## model LiLT
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt)
model_lilt.to(device)

tokenizer1 = tokenizer_lilt
model1 = model_lilt

## model LayoutXLM
from transformers import LayoutLMv2ForTokenClassification  # LayoutXLMTokenizerFast,
model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm)
model_layoutxlm.to(device)

# feature extractor
from transformers import LayoutLMv2FeatureExtractor
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)

# tokenizer
from transformers import AutoTokenizer
tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)

tokenizer2 = tokenizer_layoutxlm
model2 = model_layoutxlm

# get labels
id2label = model_lilt.config.id2label
label2id = model_lilt.config.label2id
num_labels = len(id2label)
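# ---------------------------------------------------------------------------
# Note: the actual ensembling lives in functions.py (predictions_paragraph_level).
# The helper below is only a minimal, self-contained sketch of the idea described
# in the APP text: for each block, turn each model's logits into probabilities
# with a softmax, sum the two probability vectors, and keep the label with the
# highest total. The function name and signature are illustrative and are not
# the ones used by functions.py.
def ensemble_label_sketch(logits1: torch.Tensor, logits2: torch.Tensor) -> int:
    probs1 = torch.softmax(logits1, dim=-1)  # normalized probabilities from model 1 (LiLT)
    probs2 = torch.softmax(logits2, dim=-1)  # normalized probabilities from model 2 (LayoutXLM)
    return int(torch.argmax(probs1 + probs2))  # label id with the highest summed probability
# ---------------------------------------------------------------------------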
# APP outputs by model
def app_outputs(uploaded_pdf):
    filename, msg, images = pdf_to_images(uploaded_pdf)
    num_images = len(images)

    if not msg.startswith("Error with the PDF"):

        # Extraction of image data (text and bounding boxes)
        dataset, texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes = extraction_data_from_image(images)

        # prepare our data in the format of the model
        # model1
        prepare_inference_features_partial1 = partial(prepare_inference_features_paragraph, tokenizer=tokenizer1, max_length=max_length, cls_box=cls_box1, sep_box=sep_box1)
        encoded_dataset1 = dataset.map(prepare_inference_features_partial1, batched=True, batch_size=64, remove_columns=dataset.column_names)
        custom_encoded_dataset1 = CustomDataset(encoded_dataset1, tokenizer1)

        # model2
        prepare_inference_features_partial2 = partial(prepare_inference_features_paragraph, tokenizer=tokenizer2, max_length=max_length, cls_box=cls_box2, sep_box=sep_box2)
        encoded_dataset2 = dataset.map(prepare_inference_features_partial2, batched=True, batch_size=64, remove_columns=dataset.column_names)
        custom_encoded_dataset2 = CustomDataset(encoded_dataset2, tokenizer2)

        # Get predictions (token level)
        # model1
        outputs1, images_ids_list1, chunk_ids1, input_ids1, bboxes1 = predictions_token_level(images, custom_encoded_dataset1, model_id1, model1)
        # model2
        outputs2, images_ids_list2, chunk_ids2, input_ids2, bboxes2 = predictions_token_level(images, custom_encoded_dataset2, model_id2, model2)

        # Get predictions (paragraph level)
        bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_paragraph_level(
            max_length, tokenizer1, id2label, dataset,
            outputs1, images_ids_list1, chunk_ids1, input_ids1, bboxes1, cls_box1, sep_box1,
            tokenizer2, outputs2, images_ids_list2, chunk_ids2, input_ids2, bboxes2, cls_box2, sep_box2,
        )

        # Get labeled images with paragraph bounding boxes
        images = get_labeled_images(id2label, dataset, images_ids_list1, bboxes_list_dict, probs_dict_dict)

        # get image of PDF without bounding boxes
        img_files = list()
        for i in range(num_images):
            if filename != "files/blank.png":
                img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
            else:
                img_file = filename.replace(".pdf", ".png")
            img_file = img_file.replace("/", "_")
            images[i].save(img_file)
            img_files.append(img_file)

        if num_images < max_imgboxes:
            img_files += [image_blank] * (max_imgboxes - num_images)
            images += [Image.open(image_blank)] * (max_imgboxes - num_images)
            for count in range(max_imgboxes - num_images):
                df[num_images + count] = pd.DataFrame()
        else:
            img_files = img_files[:max_imgboxes]
            images = images[:max_imgboxes]
            df = dict(itertools.islice(df.items(), max_imgboxes))

        # save
        csv_files = list()
        for i in range(max_imgboxes):
            csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
            csv_file = csv_file.replace("/", "_")
            csv_files.append(gr.File.update(value=csv_file, visible=True))
            df[i].to_csv(csv_file, encoding="utf-8", index=False)

    else:
        img_files, images, csv_files = [""] * max_imgboxes, [""] * max_imgboxes, [""] * max_imgboxes
        img_files[0], img_files[1] = image_blank, image_blank
        images[0], images[1] = Image.open(image_blank), Image.open(image_blank)
        csv_file = "csv_wo_content.csv"
        csv_files[0], csv_files[1] = gr.File.update(value=csv_file, visible=True), gr.File.update(value=csv_file, visible=True)
        df, df_empty = dict(), pd.DataFrame()
        df_empty.to_csv(csv_file, encoding="utf-8", index=False)  # write the empty CSV once
        df[0], df[1] = df_empty, df_empty  # return the (empty) dataframes, not the None returned by to_csv()

    return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]

# Gradio APP
with gr.Blocks(title='Inference APP for Document Understanding at paragraph level (v3 - Ensemble "LiLT + LayoutXLM" base)', css=".gradio-container") as demo:
    gr.HTML("""

Inference APP for Document Understanding at paragraph level (v3 - Ensemble "LiLT + LayoutXLM" base)

(04/04/2023) This Inference APP uses an ensemble of 2 Document Understanding models finetuned on the dataset DocLayNet base at paragraph level (chunk size of 512 tokens) and combined with XLM-RoBERTa base: LiLT base and LayoutXLM base.

For each block, this ensemble normalizes the per-label probabilities obtained from each model's outputs, sums them, and selects the label with the highest total (a minimal sketch of this combination appears near the top of this APP's code).

Note: LiLT (Language-Independent Layout Transformer) and LayoutXLM (Multimodal Pre-training for Multilingual Visually-rich Document Understanding) are Document Understanding models that use both layout and text in order to predict the labels of bounding boxes. Combined with the model XLM-RoBERTa base, these finetuned models have the capacity to understand any language. Finetuned on the dataset DocLayNet base, they can classify any bounding box (and its OCR text) into 11 labels (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).

They rely on an external OCR engine to get words and bounding boxes from the document image. Thus, this APP first runs an OCR engine (PyTesseract) to get the bounding boxes, then runs the 2 models (already fine-tuned on the dataset DocLayNet base at paragraph level) on the individual tokens, then normalizes and sums the block probabilities as explained above, and finally visualizes the result at paragraph level!

This allows you to get all pages of any PDF (in any language) with bounding boxes labeled at paragraph level, together with the associated dataframes of labeled data (bounding boxes, texts, labels) :-)

However, the inference time per page can be high when running the models on CPU due to the number of paragraph predictions to be made. Therefore, to keep runs short, this APP only processes the first 2 pages. If you want to increase this limit, you can either clone this APP in a Hugging Face Space (or run its notebook on your own platform) and change the value of the parameter max_imgboxes, or run the inference notebook "Document AI | Inference at paragraph level by using the association of 2 Document Understanding models (LiLT and LayoutXLM base fine-tuned on DocLayNet base dataset)" on your own platform, as it does not have this limit.

Links to Document Understanding APPs:

More information about the DocLayNet datasets, the finetuning of the models, and this APP can be found in the following blog posts:

""") with gr.Row(): pdf_file = gr.File(label="PDF") with gr.Row(): submit_btn = gr.Button(f"Display first {max_imgboxes} labeled PDF pages") reset_btn = gr.Button(value="Clear") with gr.Row(): output_msg = gr.Textbox(label="Output message") with gr.Row(): fileboxes = [] for num_page in range(max_imgboxes): file_path = gr.File(visible=True, label=f"Image file of the PDF page n°{num_page}") fileboxes.append(file_path) with gr.Row(): imgboxes = [] for num_page in range(max_imgboxes): img = gr.Image(type="pil", label=f"Image of the PDF page n°{num_page}") imgboxes.append(img) with gr.Row(): csvboxes = [] for num_page in range(max_imgboxes): csv = gr.File(visible=True, label=f"CSV file at paragraph level (page {num_page})") csvboxes.append(csv) with gr.Row(): dfboxes = [] for num_page in range(max_imgboxes): df = gr.Dataframe( headers=["bounding boxes", "texts", "labels"], datatype=["str", "str", "str"], col_count=(3, "fixed"), visible=True, label=f"Data of page {num_page}", type="pandas", wrap=True ) dfboxes.append(df) outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes) # https://github.com/gradio-app/gradio/pull/2044/files#diff-a91dd2749f68bb7d0099a0f4079a4fd2d10281e299e7b451cb1bb876a7c21975R91 reset_btn.click( lambda: [pdf_file.update(value=None), output_msg.update(value=None)] + [filebox.update(value=None) for filebox in fileboxes] + [imgbox.update(value=None) for imgbox in imgboxes] + [csvbox.update(value=None) for csvbox in csvboxes] + [dfbox.update(value=None) for dfbox in dfboxes], inputs=[], outputs=[pdf_file, output_msg] + fileboxes + imgboxes + csvboxes + dfboxes ) gr.Examples( [["files/example.pdf"]], [pdf_file], outputboxes, fn=app_outputs, cache_examples=True, ) demo.launch()