diff --git a/.gitignore b/.gitignore
index e69de29..ebaeab5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,3 @@
+datasets/
+output/
+**/__pycache__/
\ No newline at end of file
diff --git a/README.md b/README.md
index 49b86f8..11b403a 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,79 @@
-# Document Layout Analysis
-> Tools for extract figure, table, text,... from a pdf document
+# Document-Layout-Analysis
+Tools for extracting figures, tables, text, ... from a PDF document
+## Installation
+```
+$ pip install -r requirements.txt
+```
+### Install detectron2
+Requirements:
+- CUDA 10.1
+- PyTorch >= 1.7.0
+How to install CUDA 10.1 can be found here: https://developer.nvidia.com/cuda-10.1-download-archive-base
+
-## About The Project
+How to install PyTorch can be found here: https://pytorch.org/
+
-![](./image/demo.png)
+After installing the packages above, follow the instructions below to install detectron2:
+```
+$ git clone https://github.com/facebookresearch/detectron2.git
+$ cd detectron2
+$ git checkout 8e3effc
+$ cd ..
+$ python -m pip install -e detectron2
+```
+### Install Document-Layout-Analysis
+Follow the instructions below:
+```
+$ git clone -b dev https://github.com/Wild-Rift/Document-Layout-Analysis.git
+$ cd Document-Layout-Analysis
+```
-## Installation
+## Train
+### Dataset
-See [Installed](https://github.com/Wild-Rift/Document-Layout-Analysis/tree/staging).
+We use the [IBM PubLayNet](https://developer.ibm.com/technologies/artificial-intelligence/data/publaynet/) dataset for training and testing.
-## Training model
+It includes 358,353 images in total: 335,703 training images, 11,245 validation images, and 11,405 test images. The category-id to label mapping of this dataset is:
+| Category id | Label |
+| :---: | :--- |
+| 1 | Text |
+| 2 | Title |
+| 3 | List |
+| 4 | Table |
+| 5 | Figure |
-See [Train model](https://github.com/Wild-Rift/Document-Layout-Analysis/tree/dev)
+After downloading and extracting the dataset, put it in the ```datasets``` directory. The directories should be arranged like this:
-## Evalutate model
+    root
+    ├── tools
+    ├── configs
+    ├── train.py
+    ├── output
+    │   ├── ...
+    │
+    ├── datasets
+    │   ├── publaynet
+    │   │   ├── test/
+    │   │   ├── train/
+    │   │   ├── val/
+    │   │   ├── train.json
+    │   │   ├── val.json
-See [Colab Notebook](https://colab.research.google.com/drive/1WBzVAgLdldrX6Gs1lbUaUPF63fkjcw4t?usp=sharing)
+### Training
+Document-Layout-Analysis supports training two models: Faster R-CNN and Mask R-CNN.
-## My Team.
+```
+$ CONFIG_FILE='configs/faster_rcnn_R_101_FPN_3x.yaml' # if using the Faster R-CNN model
+$ CONFIG_FILE='configs/mask_rcnn_R_101_FPN_3x.yaml'   # if using the Mask R-CNN model
+```
+If you want to inspect the model structures, see the ```configs``` directory.
-1. [Pham Van Tien](https://github.com/vietnamican)
-2. [Nguyen Trung Duc](https://github.com/caoboiyb)
-3. [Tran Tien Quan](https://github.com/Lill98)
-4. [Bui Xuan Thoai](https://github.com/ThanThoai)
+To train on 8 GPUs, run:
+```
+$ python train.py --num-gpus 8 --config-file $CONFIG_FILE
+```
+To train on 1 GPU, you may need to [change some parameters](https://arxiv.org/abs/1706.02677) (linear scaling rule); run:
+```
+$ python train.py --num-gpus 1 \
+    --config-file $CONFIG_FILE \
+    SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025
+```
+Model checkpoints will be stored in the ```output``` directory periodically during training (every ```SOLVER.CHECKPOINT_PERIOD``` iterations).
\ No newline at end of file
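A side note on the single-GPU command above: the trailing ```KEY VALUE``` pairs are detectron2 config overrides, applied after the YAML file is merged, which mirrors what ```setup()``` in ```train.py``` (later in this diff) does. A minimal sketch, assuming detectron2 is installed and the working directory is the repository root:

```python
# Sketch of the config resolution performed by train.py's setup():
# the YAML file is merged first, then the CLI KEY VALUE pairs override it.
from detectron2.config import get_cfg

cfg = get_cfg()
cfg.merge_from_file("configs/faster_rcnn_R_101_FPN_3x.yaml")
# Equivalent of appending `SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025` to the command:
cfg.merge_from_list(["SOLVER.IMS_PER_BATCH", "2", "SOLVER.BASE_LR", "0.0025"])
print(cfg.SOLVER.BASE_LR)  # 0.0025
```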
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/configs/Base-RCNN-FPN.yaml b/configs/Base-RCNN-FPN.yaml
new file mode 100644
index 0000000..d40fe5e
--- /dev/null
+++ b/configs/Base-RCNN-FPN.yaml
@@ -0,0 +1,42 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
+    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
+  RPN:
+    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
+    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
+    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
+    # Detectron1 uses 2000 proposals per-batch,
+    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
+    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
+    POST_NMS_TOPK_TRAIN: 1000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "StandardROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+  ROI_MASK_HEAD:
+    NAME: "MaskRCNNConvUpsampleHead"
+    NUM_CONV: 4
+    POOLER_RESOLUTION: 14
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
\ No newline at end of file
diff --git a/configs/faster_rcnn_R_101_FPN_3x.yaml b/configs/faster_rcnn_R_101_FPN_3x.yaml
new file mode 100644
index 0000000..28d7f07
--- /dev/null
+++ b/configs/faster_rcnn_R_101_FPN_3x.yaml
@@ -0,0 +1,18 @@
+_BASE_: "Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 101
+  ROI_HEADS:
+    BATCH_SIZE_PER_IMAGE: 512
+DATASETS:
+  TRAIN: ("publaynet_train",)
+  TEST: ("publaynet_val",)
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 75500
+  IMS_PER_BATCH: 2
+  BASE_LR: 0.0009
+DATALOADER:
+  NUM_WORKERS: 1
diff --git a/configs/mask_rcnn_R_101_FPN_3x.yaml b/configs/mask_rcnn_R_101_FPN_3x.yaml
new file mode 100644
index 0000000..d88a25a
--- /dev/null
+++ b/configs/mask_rcnn_R_101_FPN_3x.yaml
@@ -0,0 +1,18 @@
+_BASE_: "Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x/138205316/model_final_a3ec72.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 101
+  ROI_HEADS:
+    BATCH_SIZE_PER_IMAGE: 512
+DATASETS:
+  TRAIN: ("publaynet_train",)
+  TEST: ("publaynet_val",)
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 75500
+  IMS_PER_BATCH: 2
+  BASE_LR: 0.0009
+DATALOADER:
+  NUM_WORKERS: 1
\ No newline at end of file
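A note on the solver schedule shared by the two configs above (my arithmetic, not part of the diff): with ```IMS_PER_BATCH: 2``` over PubLayNet's 335,703 training images, ```MAX_ITER: 75500``` covers less than half an epoch, and both ```STEPS``` values lie beyond ```MAX_ITER```, so the learning rate never decays under this schedule:

```python
# Back-of-envelope check of the solver schedule in the two configs above.
train_images = 335_703   # PubLayNet training-set size, from the README
ims_per_batch = 2        # SOLVER.IMS_PER_BATCH
iters_per_epoch = train_images / ims_per_batch
print(iters_per_epoch)            # ~167851.5 iterations per epoch
print(75_500 / iters_per_epoch)   # ~0.45 epochs covered by MAX_ITER
print(210_000 > 75_500)           # True: the first decay step is never reached
```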
"isthing": 1, "id": 1, "name": "Background"}, + {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "Text"}, + {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "Title"}, + {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "List"}, + {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "Table"}, + {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "Figure"}, +] + +def _get_publaynet_instances_meta(): + thing_ids = [k["id"] for k in PUBLAYNET_CATEGORIES if k["isthing"] == 1] + thing_colors = [k["color"] for k in PUBLAYNET_CATEGORIES if k["isthing"] == 1] + assert len(thing_ids) == 6, len(thing_ids) + # Mapping from the incontiguous COCO category id to an id in [0, 79] + thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} + thing_classes = [k["name"] for k in PUBLAYNET_CATEGORIES if k["isthing"] == 1] + ret = { + "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, + "thing_classes": thing_classes, + "thing_colors": thing_colors, + } + return ret +def register_publaynet_dataset(): + register_coco_instances("publaynet_train", {}, "datasets/publaynet/val.json", "datasets/publaynet/val/") + register_coco_instances("publaynet_val", {}, "datasets/publaynet/val.json", "datasets/publaynet/val/") \ No newline at end of file diff --git a/tools/visualize_json_result.py b/tools/visualize_json_result.py new file mode 100644 index 0000000..ad8c8ed --- /dev/null +++ b/tools/visualize_json_result.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. + +import argparse +import json +import numpy as np +import os +from collections import defaultdict +import cv2 +import tqdm + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.structures import Boxes, BoxMode, Instances +from detectron2.utils.file_io import PathManager +from detectron2.utils.logger import setup_logger +from detectron2.utils.visualizer import Visualizer + + +def create_instances(predictions, image_size): + ret = Instances(image_size) + + score = np.asarray([x["score"] for x in predictions]) + chosen = (score > args.conf_threshold).nonzero()[0] + score = score[chosen] + bbox = np.asarray([predictions[i]["bbox"] for i in chosen]).reshape(-1, 4) + bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) + + labels = np.asarray([dataset_id_map(predictions[i]["category_id"]) for i in chosen]) + + ret.scores = score + ret.pred_boxes = Boxes(bbox) + ret.pred_classes = labels + + try: + ret.pred_masks = [predictions[i]["segmentation"] for i in chosen] + except KeyError: + pass + return ret + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="A script that visualizes the json predictions from COCO or LVIS dataset." 
diff --git a/tools/visualize_json_result.py b/tools/visualize_json_result.py
new file mode 100644
index 0000000..ad8c8ed
--- /dev/null
+++ b/tools/visualize_json_result.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import argparse
+import json
+import numpy as np
+import os
+from collections import defaultdict
+import cv2
+import tqdm
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import Boxes, BoxMode, Instances
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import setup_logger
+from detectron2.utils.visualizer import Visualizer
+
+
+def create_instances(predictions, image_size):
+    ret = Instances(image_size)
+
+    score = np.asarray([x["score"] for x in predictions])
+    chosen = (score > args.conf_threshold).nonzero()[0]
+    score = score[chosen]
+    bbox = np.asarray([predictions[i]["bbox"] for i in chosen]).reshape(-1, 4)
+    bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+
+    labels = np.asarray([dataset_id_map(predictions[i]["category_id"]) for i in chosen])
+
+    ret.scores = score
+    ret.pred_boxes = Boxes(bbox)
+    ret.pred_classes = labels
+
+    try:
+        ret.pred_masks = [predictions[i]["segmentation"] for i in chosen]
+    except KeyError:
+        pass
+    return ret
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="A script that visualizes the json predictions from COCO or LVIS dataset."
+    )
+    parser.add_argument("--input", required=True, help="JSON file produced by the model")
+    parser.add_argument("--output", required=True, help="output directory")
+    parser.add_argument("--dataset", help="name of the dataset", default="coco_2017_val")
+    parser.add_argument("--conf-threshold", default=0.5, type=float, help="confidence threshold")
+    args = parser.parse_args()
+
+    logger = setup_logger()
+
+    with PathManager.open(args.input, "r") as f:
+        predictions = json.load(f)
+
+    pred_by_image = defaultdict(list)
+    for p in predictions:
+        pred_by_image[p["image_id"]].append(p)
+
+    dicts = list(DatasetCatalog.get(args.dataset))
+    metadata = MetadataCatalog.get(args.dataset)
+    if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
+
+        def dataset_id_map(ds_id):
+            return metadata.thing_dataset_id_to_contiguous_id[ds_id]
+
+    elif "lvis" in args.dataset:
+        # LVIS results are in the same format as COCO results, but have a different
+        # mapping from dataset category id to contiguous category id in [0, #categories - 1]
+        def dataset_id_map(ds_id):
+            return ds_id - 1
+
+    else:
+        raise ValueError("Unsupported dataset: {}".format(args.dataset))
+
+    os.makedirs(args.output, exist_ok=True)
+
+    for dic in tqdm.tqdm(dicts):
+        img = cv2.imread(dic["file_name"], cv2.IMREAD_COLOR)[:, :, ::-1]
+        basename = os.path.basename(dic["file_name"])
+
+        predictions = create_instances(pred_by_image[dic["image_id"]], img.shape[:2])
+        vis = Visualizer(img, metadata)
+        vis_pred = vis.draw_instance_predictions(predictions).get_image()
+
+        vis = Visualizer(img, metadata)
+        vis_gt = vis.draw_dataset_dict(dic).get_image()
+
+        concat = np.concatenate((vis_pred, vis_gt), axis=1)
+        cv2.imwrite(os.path.join(args.output, basename), concat[:, :, ::-1])
\ No newline at end of file
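A usage sketch for the script above, with assumed paths: ```python tools/visualize_json_result.py --input output/inference/coco_instances_results.json --output output/vis --dataset publaynet_val``` (```coco_instances_results.json``` is the default file name detectron2's ```COCOEvaluator``` writes during evaluation). Note that nothing in the script registers the PubLayNet datasets, so ```register_publaynet_dataset()``` from ```tools``` presumably has to be imported and called first (or a builtin name passed to ```--dataset```) before ```DatasetCatalog.get``` can resolve ```publaynet_val```.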
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..b0538ba
--- /dev/null
+++ b/train.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+Detection Training Script.
+
+This script reads a given config file and runs the training or evaluation.
+It is an entry point that is made to train standard models in detectron2.
+
+In order to let one script support training of many models,
+this script contains logic that is specific to these built-in models and therefore
+may not be suitable for your own project.
+For example, your research project perhaps only needs a single "evaluator".
+
+Therefore, we recommend you to use detectron2 as a library and take
+this file as an example of how to use the library.
+You may want to write your own script with your datasets and other customizations.
+"""
+
+import logging
+import os
+from collections import OrderedDict
+import torch
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import MetadataCatalog
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch
+from detectron2.evaluation import (
+    CityscapesInstanceEvaluator,
+    CityscapesSemSegEvaluator,
+    COCOEvaluator,
+    COCOPanopticEvaluator,
+    DatasetEvaluators,
+    LVISEvaluator,
+    PascalVOCDetectionEvaluator,
+    SemSegEvaluator,
+    verify_results,
+)
+from detectron2.modeling import GeneralizedRCNNWithTTA
+
+from tools import register_publaynet_dataset
+
+register_publaynet_dataset()
+
+
+class Trainer(DefaultTrainer):
+    """
+    We use the "DefaultTrainer" which contains pre-defined default logic for
+    standard training workflow. They may not work for you, especially if you
+    are working on a new research project. In that case you can write your
+    own training loop. You can use "tools/plain_train_net.py" as an example.
+    """
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        """
+        Create evaluator(s) for a given dataset.
+        This uses the special metadata "evaluator_type" associated with each builtin dataset.
+        For your own dataset, you can simply create an evaluator manually in your
+        script and do not have to worry about the hacky if-else logic here.
+        """
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluator_list = []
+        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+        if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
+            evaluator_list.append(
+                SemSegEvaluator(
+                    dataset_name,
+                    distributed=True,
+                    output_dir=output_folder,
+                )
+            )
+        if evaluator_type in ["coco", "coco_panoptic_seg"]:
+            evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))
+        if evaluator_type == "coco_panoptic_seg":
+            evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
+        if evaluator_type == "cityscapes_instance":
+            assert (
+                torch.cuda.device_count() >= comm.get_rank()
+            ), "CityscapesEvaluator currently do not work with multiple machines."
+            return CityscapesInstanceEvaluator(dataset_name)
+        if evaluator_type == "cityscapes_sem_seg":
+            assert (
+                torch.cuda.device_count() >= comm.get_rank()
+            ), "CityscapesEvaluator currently do not work with multiple machines."
+            return CityscapesSemSegEvaluator(dataset_name)
+        elif evaluator_type == "pascal_voc":
+            return PascalVOCDetectionEvaluator(dataset_name)
+        elif evaluator_type == "lvis":
+            return LVISEvaluator(dataset_name, cfg, True, output_folder)
+        if len(evaluator_list) == 0:
+            raise NotImplementedError(
+                "no Evaluator for the dataset {} with the type {}".format(
+                    dataset_name, evaluator_type
+                )
+            )
+        elif len(evaluator_list) == 1:
+            return evaluator_list[0]
+        return DatasetEvaluators(evaluator_list)
+
+    @classmethod
+    def test_with_TTA(cls, cfg, model):
+        logger = logging.getLogger("detectron2.trainer")
+        # In the end of training, run an evaluation with TTA
+        # Only support some R-CNN models.
+        logger.info("Running inference with test-time augmentation ...")
+        model = GeneralizedRCNNWithTTA(cfg, model)
+        evaluators = [
+            cls.build_evaluator(
+                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
+            )
+            for name in cfg.DATASETS.TEST
+        ]
+        res = cls.test(cfg, model, evaluators)
+        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
+        return res
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if cfg.TEST.AUG.ENABLED:
+            res.update(Trainer.test_with_TTA(cfg, model))
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        return res
+
+    """
+    If you'd like to do anything fancier than the standard training logic,
+    consider writing your own training loop (see plain_train_net.py) or
+    subclassing the trainer.
+    """
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    if cfg.TEST.AUG.ENABLED:
+        trainer.register_hooks(
+            [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
+        )
+    return trainer.train()
+
+
+if __name__ == "__main__":
+
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
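To close, a hedged sketch of driving the ```--eval-only``` path of ```train.py``` above from Python rather than the CLI; ```output/model_final.pth``` is detectron2's default name for the final checkpoint and is an assumption here, not something this diff pins down:

```python
# Programmatic equivalent of: python train.py --eval-only --config-file ...
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg

from train import Trainer  # importing train.py also registers the PubLayNet datasets

cfg = get_cfg()
cfg.merge_from_file("configs/faster_rcnn_R_101_FPN_3x.yaml")
cfg.MODEL.WEIGHTS = "output/model_final.pth"  # assumed checkpoint location

model = Trainer.build_model(cfg)
DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
    cfg.MODEL.WEIGHTS, resume=False
)
results = Trainer.test(cfg, model)  # runs COCOEvaluator on DATASETS.TEST ("publaynet_val")
print(results)
```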