def __getitem__(self, index: int) -> tuple[Any, Any]:
    """Load the sample at ``index`` and run it through the configured transforms.

    The sample is read with ``self._read_sample`` and then passed, in order, through
    ``self._pre_transforms``, ``self.img_transforms`` and ``self.sample_transforms``
    (each one only when it is not ``None``).

    If anything in that pipeline raises, the error is printed together with the
    file name and sample 0 is returned instead, so that a single corrupt file does
    not abort a whole training run.

    Args:
        index: position of the sample in ``self.data``

    Returns:
        the ``(img, target)`` pair after all transforms

    Raises:
        Exception: whatever the pipeline raised, when the failing sample is sample 0
            itself (there is nothing left to fall back to)
        IndexError: when ``index`` is outside ``self.data``
    """
    try:
        # Read image
        img, target = self._read_sample(index)
        # Pre-transforms (format conversion at run-time etc.)
        if self._pre_transforms is not None:
            img, target = self._pre_transforms(img, target)

        if self.img_transforms is not None:
            # typing issue cf. https://github.com/python/mypy/issues/5485
            img = self.img_transforms(img)

        if self.sample_transforms is not None:
            # Conditions to assess it is detection model with multiple classes and avoid confusion with other tasks.
            if (
                isinstance(target, dict)
                and all(isinstance(item, np.ndarray) for item in target.values())
                and set(target.keys()) != {"boxes", "labels"}  # avoid confusion with obj detection target
            ):
                # NOTE(review): every class is transformed starting from the same `img`;
                # only the image returned for the last class is kept.
                img_transformed = _copy_tensor(img)
                for class_name, bboxes in target.items():
                    img_transformed, target[class_name] = self.sample_transforms(img, bboxes)
                img = img_transformed
            else:
                img, target = self.sample_transforms(img, target)
    except Exception:
        # Sample 0 is the fallback: if it is the one failing, falling back again
        # would recurse until RecursionError, so surface the real error instead.
        if index == 0:
            raise
        # An out-of-range index raises IndexError here and propagates to the caller.
        img_name = self.data[index][0]
        print()
        print(f"!!!ERROR in Dataset on filename {img_name}")
        traceback.print_exc()
        print()
        return self.__getitem__(0)

    return img, target
# Optional Slack integration: the webhook URL is read once, at import time, from a
# file in the user's config directory. When the file is absent the integration is off.
SLACK_WEBHOOK_URL = None
SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt"))
if SLACK_WEBHOOK_PATH.exists():
    with open(SLACK_WEBHOOK_PATH) as f:
        SLACK_WEBHOOK_URL = f.read().strip()
else:
    print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...")


def send_on_slack(text: str, *, timeout: float = 10.0) -> None:
    """Send a timestamped message on Slack (best effort).

    Does nothing when no webhook URL is configured. Any failure (``requests`` not
    installed, network error, timeout, non-2xx answer) is reported on stdout and
    otherwise ignored, so that notifications can never interrupt a training run.

    Args:
        text: message to send on Slack
        timeout: seconds to wait for Slack before giving up; without it a stalled
            connection would block the training loop indefinitely
    """
    if not SLACK_WEBHOOK_URL:
        return

    try:
        # Imported lazily so the scripts keep working without `requests` installed
        import requests

        response = requests.post(
            url=SLACK_WEBHOOK_URL,
            json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"},
            timeout=timeout,
        )
        # A 4xx/5xx answer means the message was not delivered
        response.raise_for_status()
    except Exception as exc:
        # Only the exception type is printed: its message may contain the secret webhook URL
        print(f"Impossible to send message on Slack ({type(exc).__name__}), continue...")
main(args): print(args) + send_on_slack(f"Start training: {args}") if args.push_to_hub: login_to_hub() @@ -216,6 +261,9 @@ def main(args): pin_memory=torch.cuda.is_available(), ) print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + send_on_slack( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)" + ) batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)) @@ -225,6 +273,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): print(f"Resuming {args.resume}") + send_on_slack(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -278,6 +327,9 @@ def main(args): pin_memory=torch.cuda.is_available(), ) print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + send_on_slack( + f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)" + ) if args.show_samples: x, target = next(iter(train_loader)) @@ -354,9 +406,11 @@ def main(args): val_loss, acc = evaluate(model, val_loader, batch_transforms) if val_loss < min_loss: print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + send_on_slack(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") # W&B if args.wb: wandb.log({ @@ -365,6 +419,7 @@ def main(args): }) if args.early_stop and early_stopper.early_stop(val_loss): print("Training halted early due to reaching patience limit.") + send_on_slack("Training halted early due to reaching patience limit.") break if 
args.wb: run.finish() diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py index 9a6cb947d9..ff6aa37b97 100644 --- a/references/classification/train_tensorflow_orientation.py +++ b/references/classification/train_tensorflow_orientation.py @@ -34,6 +34,33 @@ from doctr.transforms.functional import rotated_img_tensor from utils import EarlyStopper, plot_recorder, plot_samples +SLACK_WEBHOOK_URL = None +SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) +if SLACK_WEBHOOK_PATH.exists(): + with open(SLACK_WEBHOOK_PATH) as f: + SLACK_WEBHOOK_URL = f.read().strip() +else: + print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") + + +def send_on_slack(text: str): + """Send a message on Slack. + + Args: + text (str): message to send on Slack + """ + if SLACK_WEBHOOK_URL: + try: + import requests + + requests.post( + url=SLACK_WEBHOOK_URL, + json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, + ) + except Exception: + print("Impossible to send message on Slack, continue...") + + CLASSES = [0, -90, 180, 90] @@ -108,7 +135,10 @@ def apply_grads(optimizer, grads, model): def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): # Iterate over the batches of the dataset + last_progress = 0 + interval_progress = 5 pbar = tqdm(train_loader, position=1) + send_on_slack(str(pbar)) for images, targets in pbar: images = batch_transforms(images) @@ -121,13 +151,22 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): apply_grads(optimizer, grads, model) pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}") + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + send_on_slack(f"Final training loss: 
{train_loss.numpy().mean():.6}") def evaluate(model, val_loader, batch_transforms): # Validation loop + last_progress = 0 + interval_progress = 5 val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0 val_iter = iter(val_loader) - for images, targets in tqdm(val_iter): + pbar = tqdm(val_iter) + send_on_slack(str(pbar)) + for images, targets in pbar: images = batch_transforms(images) out = model(images, training=False) loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out) @@ -138,6 +177,11 @@ def evaluate(model, val_loader, batch_transforms): batch_cnt += 1 samples += images.shape[0] + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + val_loss /= batch_cnt acc = correct / samples return val_loss, acc @@ -152,6 +196,7 @@ def collate_fn(samples): def main(args): print(args) + send_on_slack(f"Start training: {args}") if args.push_to_hub: login_to_hub() @@ -184,6 +229,9 @@ def main(args): print( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) + send_on_slack( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" + ) # Load doctr model model = classification.__dict__[args.arch]( @@ -238,6 +286,9 @@ def main(args): print( f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" ) + send_on_slack( + f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" + ) if args.show_samples: x, target = next(iter(train_loader)) @@ -334,9 +385,11 @@ def main(args): val_loss, acc = evaluate(model, val_loader, batch_transforms) if val_loss < min_loss: print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving 
state...") model.save_weights(Path(args.output_dir) / f"{exp_name}.weights.h5") min_loss = val_loss print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + send_on_slack(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") # W&B if args.wb: wandb.log({ @@ -353,6 +406,7 @@ def main(args): logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): print("Training halted early due to reaching patience limit.") + send_on_slack("Training halted early due to reaching patience limit.") break if args.wb: run.finish() diff --git a/references/classification/train_tensorflow_orientation_from_torch.py b/references/classification/train_tensorflow_orientation_from_torch.py new file mode 100644 index 0000000000..a4dec1286a --- /dev/null +++ b/references/classification/train_tensorflow_orientation_from_torch.py @@ -0,0 +1,471 @@ +# Copyright (C) 2021-2024, Mindee. + +# This program is licensed under the Apache License 2.0. +# See LICENSE or go to for full license details. 
+ +import os + +os.environ["USE_TORCH"] = "1" + +import datetime +import logging +import multiprocessing as mp +import time +from pathlib import Path + +import numpy as np +import torch +import wandb +from torch.nn.functional import cross_entropy +from torch.optim.lr_scheduler import CosineAnnealingLR, MultiplicativeLR, OneCycleLR +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from torchvision.transforms import functional as F +from torchvision.transforms.v2 import ( + Compose, + GaussianBlur, + Normalize, + RandomGrayscale, + RandomPerspective, + RandomPhotometricDistort, +) +from tqdm.auto import tqdm + +from doctr import transforms as T +from doctr.datasets import OrientationDataset +from doctr.models import classification, login_to_hub, push_to_hf_hub +from doctr.models.utils import export_model_to_onnx +from utils import EarlyStopper, plot_recorder, plot_samples + +SLACK_WEBHOOK_URL = None +SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) +if SLACK_WEBHOOK_PATH.exists(): + with open(SLACK_WEBHOOK_PATH) as f: + SLACK_WEBHOOK_URL = f.read().strip() +else: + print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") + + +def send_on_slack(text: str): + """Send a message on Slack. 
CLASSES = [0, -90, 180, 90]


def rnd_rotate(img: torch.Tensor, target):
    """Rotate ``img`` by a randomly drawn orientation class and return the new label.

    One of the angles in ``CLASSES`` is drawn; with probability 0.5 a jitter taken
    from ``np.arange(-25, 25, 5)`` is added on top of it. The incoming ``target`` is
    ignored: the label is the index of the drawn class.

    Args:
        img: image tensor to rotate
        target: unused, present to satisfy the sample-transform signature

    Returns:
        tuple of the rotated image (first three entries along dim 0 only) and the
        index of the drawn angle in ``CLASSES``
    """
    base_angle = int(np.random.choice(CLASSES))
    label = CLASSES.index(base_angle)

    # Decide whether to jitter before drawing the jitter itself
    jitter_applied = np.random.rand() < 0.5
    angle = base_angle + float(np.random.choice(np.arange(-25, 25, 5))) if jitter_applied else base_angle

    # Only angles outside the canonical classes need the canvas to be expanded
    needs_expand = angle not in CLASSES
    rotated = F.rotate(img, angle=-angle, fill=0, expand=needs_expand)
    return rotated[:3], label
def _to_channels_last(images):
    """Turn a batch coming from the torch loader into a channels-last TensorFlow tensor.

    Assumes ``images`` is laid out as (N, C, H, W) -- TODO confirm against the loader.
    The permutation (0, 2, 3, 1) yields (N, H, W, C); the previous (0, 3, 2, 1) produced
    (N, W, H, C), i.e. every image was transposed before reaching the model.
    """
    # Imported lazily, like everywhere else in this script
    import tensorflow as tf

    return tf.transpose(tf.convert_to_tensor(images), (0, 2, 3, 1))


def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
    """Train ``model`` for one pass over ``train_loader``.

    Progress is mirrored on Slack each time it advances by more than
    ``interval_progress`` percent, and once more at the end of the epoch.

    Args:
        model: model called as ``model(images, training=True)``
        train_loader: iterable yielding ``(images, targets)`` batches
        batch_transforms: callable applied to each image batch before conversion
        optimizer: optimizer exposing ``apply_gradients``
        amp: when True, gradients are unscaled with ``optimizer.get_unscaled_gradients``
    """
    import tensorflow as tf

    last_progress = 0
    interval_progress = 5
    # Iterate over the batches of the dataset
    pbar = tqdm(train_loader, position=1)
    send_on_slack(str(pbar))
    for images, targets in pbar:
        images = batch_transforms(images)
        images = _to_channels_last(images)

        with tf.GradientTape() as tape:
            out = model(images, training=True)
            train_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out)
        grads = tape.gradient(train_loss, model.trainable_weights)
        if amp:
            grads = optimizer.get_unscaled_gradients(grads)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}")
        # `total` is None when the iterable has no length: skip the percentage then
        if pbar.total:
            current_progress = pbar.n / pbar.total * 100
            if current_progress - last_progress > interval_progress:
                send_on_slack(str(pbar))
                last_progress = int(current_progress)
    send_on_slack(str(pbar))


def evaluate(model, val_loader, batch_transforms):
    """Run ``model`` over ``val_loader`` and return the mean loss and the accuracy.

    Args:
        model: model called as ``model(images, training=False)``
        val_loader: iterable yielding ``(images, targets)`` batches
        batch_transforms: callable applied to each image batch before conversion

    Returns:
        tuple ``(val_loss, acc)``: loss averaged over batches, fraction of correct predictions

    Raises:
        ZeroDivisionError: when ``val_loader`` yields no batch
    """
    import tensorflow as tf

    last_progress = 0
    interval_progress = 5
    val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0
    val_iter = iter(val_loader)
    pbar = tqdm(val_iter)
    send_on_slack(str(pbar))
    # Validation loop
    for images, targets in pbar:
        images = batch_transforms(images)
        # Same layout conversion as in training, so both see identically oriented images
        images = _to_channels_last(images)
        out = model(images, training=False)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out)
        # Compute metric
        correct += int((out.numpy().argmax(1) == targets.numpy()).sum())

        val_loss += loss.numpy().mean()
        batch_cnt += 1
        samples += images.shape[0]

        # `total` is None when the iterator has no length: skip the percentage then
        if pbar.total:
            current_progress = pbar.n / pbar.total * 100
            if current_progress - last_progress > interval_progress:
                send_on_slack(str(pbar))
                last_progress = int(current_progress)

    val_loss /= batch_cnt
    acc = correct / samples
    return val_loss, acc
classification_tf.__dict__[args.arch](pretrained=args.pretrained, num_classes=len(CLASSES), classes=CLASSES) + + # Resume weights + if isinstance(args.resume, str): + print(f"Resuming {args.resume}") + send_on_slack(f"Resuming {args.resume}") + checkpoint = torch.load(args.resume, map_location="cpu") + model.load_state_dict(checkpoint) + + # GPU + #if isinstance(args.device, int): + # if not torch.cuda.is_available(): + # raise AssertionError("PyTorch cannot access your GPU. Please investigate!") + # if args.device >= torch.cuda.device_count(): + # raise ValueError("Invalid device index") + ## Silent default switch to GPU if available + #elif torch.cuda.is_available(): + # args.device = 0 + #else: + # logging.warning("No accessible GPU, targe device set to CPU.") + #if torch.cuda.is_available(): + # torch.cuda.set_device(args.device) + # model = model.cuda() + + if args.test_only: + print("Running evaluation") + val_loss, acc = evaluate(model, val_loader, batch_transforms) + print(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + return + + st = time.time() + train_set = OrientationDataset( + img_folder=os.path.join(args.train_path, "images"), + img_transforms=Compose([ + T.Resize(input_size, preserve_aspect_ratio=True, symmetric_pad=True), + # Augmentations + T.RandomApply(T.ColorInversion(), 0.1), + T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1), + T.RandomApply(T.RandomShadow(), 0.2), + T.RandomApply(GaussianBlur(kernel_size=3), 0.1), + RandomPhotometricDistort(p=0.1), + RandomGrayscale(p=0.1), + RandomPerspective(distortion_scale=0.1, p=0.3), + ]), + sample_transforms=T.SampleCompose([ + lambda x, y: rnd_rotate(x, y), + T.Resize(input_size), + ]), + ) + + train_loader = DataLoader( + train_set, + batch_size=args.batch_size, + drop_last=True, + num_workers=args.workers, + sampler=RandomSampler(train_set), + pin_memory=torch.cuda.is_available(), + ) + print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " 
f"{len(train_loader)} batches)") + send_on_slack( + f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)" + ) + + if args.show_samples: + x, target = next(iter(train_loader)) + plot_samples(x, [CLASSES[t] for t in target]) + return + + # Optimizer + #optimizer = torch.optim.Adam( + # [p for p in model.parameters() if p.requires_grad], + # args.lr, + # betas=(0.95, 0.99), + # eps=1e-6, + # weight_decay=args.weight_decay, + #) + import tensorflow as tf + scheduler = tf.keras.optimizers.schedules.ExponentialDecay( + args.lr, + decay_steps=args.epochs * len(train_loader), + decay_rate=1 / (1e3), # final lr as a fraction of initial lr + staircase=False, + name="ExponentialDecay", + ) + optimizer = tf.keras.optimizers.Adam( + learning_rate=scheduler, + beta_1=0.95, + beta_2=0.99, + epsilon=1e-6, + ) + + # LR Finder + if args.find_lr: + lrs, losses = record_lr(model, train_loader, batch_transforms, optimizer, amp=args.amp) + plot_recorder(lrs, losses) + return + ## Scheduler + #if args.sched == "cosine": + # scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4) + #elif args.sched == "onecycle": + # scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader)) + + # Training monitoring + current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name + + # W&B + if args.wb: + run = wandb.init( + name=exp_name, + project="orientation-classification", + config={ + "learning_rate": args.lr, + "epochs": args.epochs, + "weight_decay": args.weight_decay, + "batch_size": args.batch_size, + "architecture": args.arch, + "input_size": input_size, + "optimizer": "adam", + "framework": "pytorch", + "classes": CLASSES, + "scheduler": args.sched, + "pretrained": args.pretrained, + }, + ) + + # Create loss queue + min_loss = np.inf + # Training loop + if args.early_stop: + early_stopper = 
def parse_args(argv=None):
    """Parse the command-line options of the orientation training script.

    Args:
        argv: list of argument strings to parse; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script entry point uses.
            Passing a list allows the parser to be driven programmatically.

    Returns:
        the populated ``argparse.Namespace``

    Raises:
        SystemExit: raised by argparse on invalid or missing arguments
    """
    import argparse

    parser = argparse.ArgumentParser(
        # This script trains a TensorFlow model fed by a PyTorch data pipeline
        description="DocTR training script for orientation classification (TensorFlow model, PyTorch data pipeline)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Positional arguments
    parser.add_argument("train_path", type=str, help="path to training data folder")
    parser.add_argument("val_path", type=str, help="path to validation data folder")
    parser.add_argument("arch", type=str, help="classification model to train")
    parser.add_argument("type", type=str, choices=["page", "crop"], help="type of data to train on")

    # Run configuration
    parser.add_argument("--name", type=str, default=None, help="Name of your training experiment")
    parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")
    parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for training")
    parser.add_argument("--device", default=None, type=int, help="device")
    parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam)")
    parser.add_argument("--wd", "--weight-decay", default=0, type=float, help="weight decay", dest="weight_decay")
    parser.add_argument("-j", "--workers", type=int, default=None, help="number of workers used for dataloading")
    parser.add_argument("--resume", type=str, default=None, help="Path to your checkpoint")
    parser.add_argument("--test-only", dest="test_only", action="store_true", help="Run the validation loop")
    parser.add_argument(
        "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples"
    )
    parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases")
    parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub")
    parser.add_argument(
        "--pretrained",
        dest="pretrained",
        action="store_true",
        help="Load pretrained parameters before starting the training",
    )
    parser.add_argument("--export-onnx", dest="export_onnx", action="store_true", help="Export the model to ONNX")
    parser.add_argument("--sched", type=str, default="cosine", help="scheduler to use")
    parser.add_argument("--amp", dest="amp", help="Use Automatic Mixed Precision", action="store_true")
    parser.add_argument("--find-lr", action="store_true", help="Gridsearch the optimal LR")

    # Early stopping
    parser.add_argument("--early-stop", action="store_true", help="Enable early stopping")
    parser.add_argument("--early-stop-epochs", type=int, default=5, help="Patience for early stopping")
    parser.add_argument("--early-stop-delta", type=float, default=0.01, help="Minimum Delta for early stopping")

    return parser.parse_args(argv)


if __name__ == "__main__":
    args = parse_args()
    main(args)
+ + Args: + text (str): message to send on Slack + """ + if SLACK_WEBHOOK_URL: + try: + import requests + + requests.post( + url=SLACK_WEBHOOK_URL, + json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, + ) + except Exception: + print("Impossible to send message on Slack, continue...") + def record_lr( model: torch.nn.Module, @@ -105,7 +131,10 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a model.train() # Iterate over the batches of the dataset + last_progress = 0 + interval_progress = 5 pbar = tqdm(train_loader, position=1) + send_on_slack(str(pbar)) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -129,8 +158,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a optimizer.step() scheduler.step() - pbar.set_description(f"Training loss: {train_loss.item():.6}") + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + send_on_slack(f"Final training loss: {train_loss.item():.6}") @torch.no_grad() @@ -139,9 +172,13 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): model.eval() # Reset val metric val_metric.reset() + last_progress = 0 + interval_progress = 5 + pbar = tqdm(val_loader) + send_on_slack(str(pbar)) # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) @@ -159,6 +196,10 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): boxes_pred = np.concatenate((boxes_pred[:, :4].min(axis=1), boxes_pred[:, :4].max(axis=1)), axis=-1) val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4]) + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) 
+ last_progress = int(current_progress) val_loss += out["loss"].item() batch_cnt += 1 @@ -169,6 +210,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): def main(args): print(args) + send_on_slack(f"Start training: {args}") if args.push_to_hub: login_to_hub() @@ -210,11 +252,80 @@ def main(args): collate_fn=val_set.collate_fn, ) print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + send_on_slack( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)" + ) with open(os.path.join(args.val_path, "labels.json"), "rb") as f: val_hash = hashlib.sha256(f.read()).hexdigest() batch_transforms = Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287)) + # funsd_ds = DetectionDataset( + # img_folder=os.path.join(args.funsd_path, "images"), + # label_path=os.path.join(args.funsd_path, "labels.json"), + # sample_transforms=T.SampleCompose( + # ( + # [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)] + # if not args.rotation or args.eval_straight + # else [] + # ) + # + ( + # [ + # T.Resize(args.input_size, preserve_aspect_ratio=True), # This does not pad + # T.RandomApply(T.RandomRotate(90, expand=True), 0.5), + # T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True), + # ] + # if args.rotation and not args.eval_straight + # else [] + # ) + # ), + # use_polygons=args.rotation and not args.eval_straight, + # ) + + # funsd_test_loader = DataLoader( + # funsd_ds, + # batch_size=args.batch_size, + # drop_last=False, + # num_workers=args.workers, + # sampler=SequentialSampler(funsd_ds), + # pin_memory=torch.cuda.is_available(), + # collate_fn=funsd_ds.collate_fn, + # ) + # print(f"FUNSD Test set loaded in {time.time() - st:.4}s ({len(funsd_ds)} samples in " f"{len(funsd_test_loader)} batches)") + + # cord_ds = DetectionDataset( + # 
img_folder=os.path.join(args.cord_path, "images"), + # label_path=os.path.join(args.cord_path, "labels.json"), + # sample_transforms=T.SampleCompose( + # ( + # [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)] + # if not args.rotation or args.eval_straight + # else [] + # ) + # + ( + # [ + # T.Resize(args.input_size, preserve_aspect_ratio=True), # This does not pad + # T.RandomApply(T.RandomRotate(90, expand=True), 0.5), + # T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True), + # ] + # if args.rotation and not args.eval_straight + # else [] + # ) + # ), + # use_polygons=args.rotation and not args.eval_straight, + # ) + + # cord_test_loader = DataLoader( + # cord_ds, + # batch_size=args.batch_size, + # drop_last=False, + # num_workers=args.workers, + # sampler=SequentialSampler(cord_ds), + # pin_memory=torch.cuda.is_available(), + # collate_fn=cord_ds.collate_fn, + # ) + # print(f"CORD Test set loaded in {time.time() - st:.4}s ({len(cord_ds)} samples in " f"{len(funsd_test_loader)} batches)") + # Load doctr model model = detection.__dict__[args.arch]( pretrained=args.pretrained, @@ -225,6 +336,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): print(f"Resuming {args.resume}") + send_on_slack(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -244,6 +356,16 @@ def main(args): model = model.cuda() # Metrics + # funsd_val_metric = LocalizationConfusion( + # use_polygons=args.rotation and not args.eval_straight, + # mask_shape=(args.input_size, args.input_size), + # use_broadcasting=True if system_available_memory > 62 else False, + # ) + # cord_val_metric = LocalizationConfusion( + # use_polygons=args.rotation and not args.eval_straight, + # mask_shape=(args.input_size, args.input_size), + # use_broadcasting=True if system_available_memory > 62 else False, + # ) val_metric = 
LocalizationConfusion(use_polygons=args.rotation and not args.eval_straight) if args.test_only: @@ -317,6 +439,9 @@ def main(args): collate_fn=train_set.collate_fn, ) print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + send_on_slack( + f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)" + ) with open(os.path.join(args.train_path, "labels.json"), "rb") as f: train_hash = hashlib.sha256(f.read()).hexdigest() @@ -401,8 +526,23 @@ def main(args): fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp) # Validation loop at the end of each epoch val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) + funsd_recall, funsd_precision, funsd_mean_iou = 0.0, 0.0, 0.0 + cord_recall, cord_precision, cord_mean_iou = 0.0, 0.0, 0.0 + # try: + # _, funsd_recall, funsd_precision, funsd_mean_iou = evaluate( + # model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp + # ) + # except Exception: + # pass + # try: + # _, cord_recall, cord_precision, cord_mean_iou = evaluate( + # model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp + # ) + # except Exception: + # pass if val_loss < min_loss: print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss if args.save_interval_epoch: @@ -412,8 +552,13 @@ def main(args): if any(val is None for val in (recall, precision, mean_iou)): log_msg += "(Undefined metric value, caused by empty GTs or predictions)" else: - log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" + log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: 
{mean_iou:.2%})\n" + log_msg += f"FUNSD: Recall: {funsd_recall:.2%} | Precision: {funsd_precision:.2%} | Mean IoU: {funsd_mean_iou:.2%}\n" + log_msg += ( + f"CORD: Recall: {cord_recall:.2%} | Precision: {cord_precision:.2%} | Mean IoU: {cord_mean_iou:.2%}" + ) print(log_msg) + send_on_slack(log_msg) # W&B if args.wb: wandb.log({ diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py index b52e106ba5..b66b939a69 100644 --- a/references/detection/train_tensorflow.py +++ b/references/detection/train_tensorflow.py @@ -34,6 +34,32 @@ from doctr.utils.metrics import LocalizationConfusion from utils import EarlyStopper, plot_recorder, plot_samples +SLACK_WEBHOOK_URL = None +SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) +if SLACK_WEBHOOK_PATH.exists(): + with open(SLACK_WEBHOOK_PATH) as f: + SLACK_WEBHOOK_URL = f.read().strip() +else: + print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") + + +def send_on_slack(text: str): + """Send a message on Slack. 
+ + Args: + text (str): message to send on Slack + """ + if SLACK_WEBHOOK_URL: + try: + import requests + + requests.post( + url=SLACK_WEBHOOK_URL, + json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, + ) + except Exception: + print("Impossible to send message on Slack, continue...") + def record_lr( model: Model, @@ -95,7 +121,10 @@ def apply_grads(optimizer, grads, model): def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): train_iter = iter(train_loader) # Iterate over the batches of the dataset + last_progress = 0 + interval_progress = 5 pbar = tqdm(train_iter, position=1) + send_on_slack(str(pbar)) for images, targets in pbar: images = batch_transforms(images) @@ -107,6 +136,11 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): apply_grads(optimizer, grads, model) pbar.set_description(f"Training loss: {train_loss.numpy():.6}") + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + send_on_slack(f"Final training loss: {train_loss.numpy():.6}") def evaluate(model, val_loader, batch_transforms, val_metric): @@ -137,6 +171,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric): def main(args): print(args) + send_on_slack(f"Start training: {args}") if args.push_to_hub: login_to_hub() @@ -176,6 +211,9 @@ def main(args): print( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) + send_on_slack( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" + ) with open(os.path.join(args.val_path, "labels.json"), "rb") as f: val_hash = hashlib.sha256(f.read()).hexdigest() @@ -271,6 +309,9 @@ def main(args): print( f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" ) + 
send_on_slack( + f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" + ) with open(os.path.join(args.train_path, "labels.json"), "rb") as f: train_hash = hashlib.sha256(f.read()).hexdigest() @@ -379,6 +420,7 @@ def main(args): min_loss = val_loss if args.save_interval_epoch: print(f"Saving state at epoch: {epoch + 1}") + send_on_slack(f"Saving state at epoch: {epoch + 1}") model.save_weights(Path(args.output_dir) / f"{exp_name}_{epoch + 1}.weights.h5") log_msg = f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " if any(val is None for val in (recall, precision, mean_iou)): @@ -386,6 +428,7 @@ def main(args): else: log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" print(log_msg) + send_on_slack(log_msg) # W&B if args.wb: wandb.log({