From f6095f062fb9647ea47a47a1038aa96fa52c6fb6 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Thu, 23 Mar 2023 13:28:51 +0000 Subject: [PATCH 1/3] Upgraded requirements. --- requirements.txt | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index 10b109c..fa7f081 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,16 @@ --find-links https://download.pytorch.org/whl/cu117 -torch==1.13.1+cu117 -numpy==1.22.4 +torch==2.0.0+cu117 +numpy==1.24.2 matplotlib==3.4.3 -sklearn==1.1.2 -pandas==1.4.3 +scikit-learn==1.2.2 +pandas==1.5.3 datasets==2.10.1 sentencepiece==0.1.97 -transformers==4.21.1 -accelerate==0.16.0 +transformers==4.27.2 +accelerate==0.17.1 beir==1.0.1 mteb==1.0.1 +seaborn wandb huggingface-cli -haven-ai @ git+https://github.com/haven-ai/haven-ai@00fe4e3a10bfe09fef361836b8fcfcffcecd3451 +haven-ai From 951b1886bc67a12769d77b1a36c6f628f4b82697 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Thu, 23 Mar 2023 13:36:50 +0000 Subject: [PATCH 2/3] Moving to a different implementation of all_gather. --- src/datasets_loader.py | 10 ++-------- src/distributed_utils.py | 36 ++++++++++++++++++++++++++++++++++++ src/utils.py | 8 +++++--- 3 files changed, 43 insertions(+), 11 deletions(-) create mode 100644 src/distributed_utils.py diff --git a/src/datasets_loader.py b/src/datasets_loader.py index 0de798e..32b52eb 100644 --- a/src/datasets_loader.py +++ b/src/datasets_loader.py @@ -129,10 +129,7 @@ def get_dataset( """ try: base_dataset = load_dataset( - dataset_name, - use_auth_token=True, - cache_dir=path_to_cache, - split=split + dataset_name, use_auth_token=True, cache_dir=path_to_cache, split=split ) except FileNotFoundError: try: @@ -168,13 +165,10 @@ def get_dataset( split_preproc_key ] - base_dataset = base_dataset.map( - pre_proc_fn(maximum_raw_length), num_proc=96 - ) + base_dataset = base_dataset.map(pre_proc_fn(maximum_raw_length), num_proc=96) base_dataset = base_dataset.shuffle(seed=42) - if "train" in split_preproc_key: return RandomlyPairedDataset(base_dataset) else: diff --git a/src/distributed_utils.py b/src/distributed_utils.py new file mode 100644 index 0000000..8f50e64 --- /dev/null +++ b/src/distributed_utils.py @@ -0,0 +1,36 @@ +import torch +import torch.distributed as dist + + +class AllGather(torch.autograd.Function): + """ + all_gather with gradient back-propagation + Adapted from https://github.com/Lightning-AI/lightning-bolts/blob/5577453a6d7072724d9ae24184daf8f45d4baff7/pl_bolts/models/self_supervised/simclr/simclr_module.py#L20-L40 + """ + + @staticmethod + def forward(ctx, tensor): + ctx.batch_size = tensor.shape[0] + + gathered_tensor = [ + torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size()) + ] + + torch.distributed.all_gather(gathered_tensor, tensor) + gathered_tensor = torch.cat(gathered_tensor, 0) + + return gathered_tensor + + @staticmethod + def backward(ctx, grad_output): + grad_input = grad_output.clone() + torch.distributed.all_reduce( + grad_input, op=torch.distributed.ReduceOp.SUM, async_op=False + ) + + idx_from = torch.distributed.get_rank() * ctx.batch_size + idx_to = (torch.distributed.get_rank() + 1) * ctx.batch_size + return grad_input[idx_from:idx_to] + + +all_gather = AllGather.apply diff --git a/src/utils.py b/src/utils.py index 5dc3956..ee81c2a 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,6 +1,6 @@ from typing import List, Union import torch -from accelerate.utils.operations import _gpu_gather +from src.distributed_utils import all_gather class TempCoef(torch.nn.Module): @@ -64,7 +64,8 @@ def gather_embeddings( 1, ) - embedding_dist = _gpu_gather(embedding) + # Gather embeddings across devices + embedding_dist = all_gather(embedding) embedding_1_dist = embedding_dist[:, 0, :] embedding_2_dist = embedding_dist[:, 1, :] @@ -96,9 +97,10 @@ def clip_contrastive_loss( # Gathers embeddings across devices. emb_1_dist, emb_2_dist = gather_embeddings(emb_1, emb_2) - # Compute cosine similarity matrix + # Compute similarity matrix similarities = emb_1_dist @ emb_2_dist.T + # Multiply similarity matrix by temperature similarities = temperature_coef(similarities) # Matching representations of positive pairs assumed to be located at the main From 03f2c1cf01b7109a2d27e9071786fd7d56cb87d2 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Thu, 23 Mar 2023 13:38:10 +0000 Subject: [PATCH 3/3] Modified hyperparams for the contrastive loss. --- exp_configs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/exp_configs.py b/exp_configs.py index 2a18ec8..370e894 100644 --- a/exp_configs.py +++ b/exp_configs.py @@ -31,18 +31,18 @@ "mlm_masking_probability": 0.15, }, "contrastive_local": { - "alpha": 0.5, + "alpha": 0.4, "initial_temperature_coef": 1.0725, # Matches initial value in clip. "local_contrastive_loss": True, "mlm_masking_probability": 0.15, - "contrastive_masking_probability": 0.3, + "contrastive_masking_probability": 0.2, }, "contrastive_global": { - "alpha": 0.5, + "alpha": 0.4, "initial_temperature_coef": 1.0725, # Matches initial value in clip. "local_contrastive_loss": False, "mlm_masking_probability": 0.15, - "contrastive_masking_probability": 0.3, + "contrastive_masking_probability": 0.2, }, }