# Source: bigcode-encoder/src/datasets_loader.py at dev · bigcode-project/bigcode-encoder
#
# 441 lines (363 loc) · 16.3 KB
from typing import List, Dict, Optional, Union

import torch
from torch.utils.data import Dataset
import datasets

# Workaround toolkit misreporting available disk space.
datasets.builder.has_sufficient_disk_space = lambda needed_bytes, directory=".": True

from datasets import load_dataset, load_from_disk
from datasets.builder import DatasetBuildError
from transformers import AutoTokenizer

from src.preprocessing_utils import (
    perturb_tokens,
    get_special_tokens_mask,
    get_pooling_mask,
    pre_process_codesearchnet_train,
    pre_process_codesearchnet_test,
    pre_process_gfg,
    pre_process_the_stack,
)
from src.constants import MASK_TOKEN, PAD_TOKEN, SEPARATOR_TOKEN, CLS_TOKEN
# Lookup table: dataset name -> split family ("train" / "test") -> pre-processing
# factory. `get_dataset` calls the selected factory with the maximum raw length
# and hands the result to `Dataset.map`.
DATASET_NAME_TO_PREPROCESSING_FUNCTION = {
    "the-stack": {
        "train": pre_process_the_stack,
        "test": pre_process_the_stack,
    },
    "code_search_net": {
        "train": pre_process_codesearchnet_train,
        "test": pre_process_codesearchnet_test,
    },
    "gfg": {
        "train": pre_process_gfg,
        "test": pre_process_gfg,
    },
}
class RandomlyPairedDataset(Dataset):
    """Indexed dataset class with randomly picked negative pairs."""

    def __init__(self, base_dataset: "datasets.Dataset") -> None:
        """Instantiates an indexed dataset wrapping a base data source.

        We use this class to be able to get examples from the dataset including negative pairs.

        Args:
            base_dataset (datasets.Dataset): Base indexed data source. Each entry is read
                through its 'source' and 'target' fields.
        """
        self.data_source = base_dataset

    def __len__(self) -> int:
        """Returns the length of the dataset which matches that of the base data source.

        Returns:
            int: Dataset length.
        """
        return len(self.data_source)

    def __getitem__(self, i: int) -> tuple:
        """Reads from the base dataset and returns an additional random entry that serves as negative example.

        Args:
            i (int): Index to be read.

        Returns:
            tuple: `(source, target, negative_source)`. The first two values are the
                'source' and 'target' fields of the example indexed by i; the third is
                the 'source' field of a different, randomly drawn example.

        Raises:
            ValueError: If the base dataset holds fewer than two examples, since no
                distinct negative example can be drawn.
        """
        dataset_length = len(self.data_source)
        if dataset_length < 2:
            # Without this guard the rejection loop below could never terminate.
            raise ValueError(
                "RandomlyPairedDataset needs at least two examples to draw a negative pair, "
                f"got {dataset_length}."
            )

        # Map negative indices to their position so the inequality check below compares
        # like with like (rand_idx is always non-negative).
        anchor_idx = i + dataset_length if i < 0 else i

        rand_idx = torch.randint(0, dataset_length, (1,)).item()
        while rand_idx == anchor_idx:
            rand_idx = torch.randint(0, dataset_length, (1,)).item()

        example = self.data_source[i]
        negative_example = self.data_source[rand_idx]

        return example["source"], example["target"], negative_example["source"]
class PairedDataset(Dataset):
    """Indexed dataset class yielding source/target pairs."""

    def __init__(self, base_dataset: "datasets.Dataset") -> None:
        """Instantiates an indexed dataset wrapping a base data source.

        We use this class to be able to get paired examples from the base dataset.
        The base dataset must be pre-processed to include the fields 'source' and 'target'.

        Args:
            base_dataset (datasets.Dataset): Base indexed pre-processed data source.
        """
        self.data_source = base_dataset

    def __len__(self) -> int:
        """Returns the length of the dataset which matches that of the base data source.

        Returns:
            int: Dataset length.
        """
        return len(self.data_source)

    def __getitem__(self, i: int) -> tuple:
        """Reads from the base dataset and returns a pair of examples.

        Args:
            i (int): Index to be read.

        Returns:
            tuple: `(source, target)`, i.e. the 'source' and 'target' fields of the
                example indexed by i.
        """
        example = self.data_source[i]

        return example["source"], example["target"]
def get_dataset(
    dataset_name: str,
    path_to_cache: str,
    split: str,
    maximum_raw_length: int,
    force_preprocess: bool = False,
    maximum_row_cout: Optional[int] = None,
    num_proc: int = 96,
) -> Union[PairedDataset, RandomlyPairedDataset]:
    """Get dataset instance.

    Args:
        dataset_name (str): Name of the base dataset.
        path_to_cache (str): Path to the base dataset.
        split (str): data split in {'train', 'valid', 'test'}.
        maximum_raw_length (int): Maximum length of the raw entries from the source dataset.
        force_preprocess (bool, optional): Whether to force pre-processing. Defaults to False.
        maximum_row_cout (int, optional): Maximum size of the dataset in term of row count. Defaults to None.
        num_proc (int, optional): Number of processes passed to `Dataset.map` for
            pre-processing. Defaults to 96.

    Returns:
        dataset: An indexed dataset object. A `RandomlyPairedDataset` when `split`
            contains "train", a `PairedDataset` otherwise.

    Raises:
        ValueError: If no pre-processing function is registered for `dataset_name`.
    """
    try:
        base_dataset = load_dataset(
            dataset_name,
            use_auth_token=True,
            cache_dir=path_to_cache,
            split=split,
        )
    except DatasetBuildError:
        # Try to specify data files. Specific for The Stack.
        base_dataset = load_dataset(
            dataset_name,
            use_auth_token=True,
            cache_dir=path_to_cache,
            data_files="sample.parquet",
            split=split,
        )
    except FileNotFoundError:
        # Try to load from disk if above failed.
        base_dataset = load_from_disk(path_to_cache)

    if force_preprocess:
        base_dataset.cleanup_cache_files()

    base_dataset = base_dataset.shuffle(seed=42)

    if maximum_row_cout is not None:
        base_dataset = base_dataset.select(
            range(min(len(base_dataset), maximum_row_cout))
        )

    # Any split whose name does not mention "train" uses the "test" pre-processing.
    if "train" in split.lower():
        split_preproc_key = "train"
    else:
        split_preproc_key = "test"

    try:
        pre_proc_fn = DATASET_NAME_TO_PREPROCESSING_FUNCTION[dataset_name][
            split_preproc_key
        ]
    except KeyError as err:
        # Fallback for names that merely contain "the-stack" (not an exact table key).
        pre_proc_fn = None
        if "the-stack" in dataset_name.lower():
            for k, split_table in DATASET_NAME_TO_PREPROCESSING_FUNCTION.items():
                if "the-stack" in k:
                    pre_proc_fn = split_table[split_preproc_key]
        if pre_proc_fn is None:
            # Previously this path fell through and failed later with an unbound local.
            raise ValueError(
                f"No pre-processing function registered for dataset '{dataset_name}'."
            ) from err

    base_dataset = base_dataset.map(
        pre_proc_fn(maximum_raw_length), num_proc=num_proc
    )

    base_dataset = base_dataset.shuffle(seed=42)

    if split_preproc_key == "train":
        return RandomlyPairedDataset(base_dataset)
    else:
        return PairedDataset(base_dataset)
def prepare_tokenizer(tokenizer_path: str):
    """Loads a tokenizer and registers the special tokens used by the collators.

    Args:
        tokenizer_path (str): Path or identifier handed to `AutoTokenizer.from_pretrained`.

    Returns:
        The loaded tokenizer with pad, sep, cls, and mask tokens set to
        PAD_TOKEN, SEPARATOR_TOKEN, CLS_TOKEN, and MASK_TOKEN respectively.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    except OSError:
        # Retry with authentication when the plain load fails.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_auth_token=True)

    # One call per token, in this fixed order, mirroring the original sequence of
    # registrations (merging them into one call might change id assignment — unverified).
    special_tokens = (
        ("pad_token", PAD_TOKEN),
        ("sep_token", SEPARATOR_TOKEN),
        ("cls_token", CLS_TOKEN),
        ("mask_token", MASK_TOKEN),
    )
    for attribute_name, token in special_tokens:
        tokenizer.add_special_tokens({attribute_name: token})

    return tokenizer
class TrainCollator:
    """Train collator object mapping sequences of items from dataset instance
    into batches of IDs and masks used for training models.
    """

    def __init__(
        self,
        tokenizer_path: str,
        maximum_length: int,
        mlm_masking_probability: float,
        contrastive_masking_probability: float,
        ignore_contrastive_loss_data: bool = False,
        **kwargs,
    ) -> None:
        """Creates instance of collator.

        Args:
            tokenizer_path (str): Path to tokenizer.
            maximum_length (int): Truncating length of token sequences.
            mlm_masking_probability (float): Masking probability for MLM objective.
            contrastive_masking_probability (float): Masking probability for contrastive objective.
            ignore_contrastive_loss_data (bool, optional): Do not append positive pairs to batch. Defaults to False.
            **kwargs: Accepted and ignored.
        """
        self.mlm_masking_probability = mlm_masking_probability
        self.contrastive_masking_probability = contrastive_masking_probability
        self.maximum_length = maximum_length
        self.ignore_contrastive_loss_data = ignore_contrastive_loss_data

        self.tokenizer = prepare_tokenizer(tokenizer_path)

        # Build the vocabulary mapping once and reuse it for all four lookups.
        vocab = self.tokenizer.get_vocab()
        self.sep_token_id = vocab[self.tokenizer.sep_token]
        self.pad_token_id = vocab[self.tokenizer.pad_token]
        self.mask_token_id = vocab[self.tokenizer.mask_token]
        self.cls_token_id = vocab[self.tokenizer.cls_token]

    def __call__(self, batch: List[Dict]) -> Dict[str, torch.Tensor]:
        """Maps list of triplets of examples to batches of token ids, masks, and labels used for training.

        The first two elements in a triplet correspond to neighbor chunks from the same file. The third
        element corresponds to a chunk from a random file.

        Args:
            batch (List[Dict]): List of triplets of examples.

        Returns:
            Dict[str, torch.Tensor]: Batches of tokens, masks, and labels under the keys
                'input_ids', 'attention_mask', 'pooling_mask', 'labels', and
                'next_sentence_label'.
        """
        source_list = [
            el[0] for el in batch
        ]  # el[0] is the first half of a code snippet.

        # Following are the labels for the seq relationship loss: 0 -> negative pair, 1 -> positive pair.
        seq_relationship_labels = torch.randint(0, 2, (len(batch),)).long()

        target_list = [
            # seq_relationship_label==1 -> positive pair -> we take the second half of the code snippet (el[1])
            # seq_relationship_label==0 -> negative pair -> we take the random code snippet given in el[2]
            el[1] if seq_relationship_labels[i] == 1 else el[2]
            for i, el in enumerate(batch)
        ]

        input_examples_list = [  # Combine source and target w/ template: [CLS] SOURCE [SEP] [TARGET] [SEP]
            f"{CLS_TOKEN}{source}{SEPARATOR_TOKEN}{target}{SEPARATOR_TOKEN}"
            for source, target in zip(source_list, target_list)
        ]

        input_examples_encoding = self.tokenizer(
            input_examples_list,
            padding="longest",
            max_length=self.maximum_length,
            truncation=True,
            return_tensors="pt",
        )

        input_examples_ids = input_examples_encoding.input_ids
        input_examples_att_mask = (
            input_examples_encoding.attention_mask
        )  # Padding masks.

        special_tokens_mask = get_special_tokens_mask(
            self.tokenizer, input_examples_ids
        )

        input_examples_ids, mlm_labels = perturb_tokens(
            input_examples_ids,
            special_tokens_mask,
            self.mlm_masking_probability,
            self.mask_token_id,
            len(self.tokenizer),
        )  # Dynamically perturbs input tokens and generates corresponding mlm labels.

        if not self.ignore_contrastive_loss_data:
            # NOTE(review): `input_examples_ids` was rebound above to the MLM-perturbed ids,
            # so this second perturbation starts from already-perturbed tokens rather than
            # from the raw tokenizer output — confirm this is intended.
            positive_examples_ids, positive_mlm_labels = perturb_tokens(
                input_examples_ids,
                special_tokens_mask,
                self.contrastive_masking_probability,
                self.mask_token_id,
                len(self.tokenizer),
            )  # Positive examples are independently perturbed versions of the source, used for the contrastive loss.

            # The positive views are stacked under the original batch along dim 0, so
            # masks and labels are duplicated accordingly.
            input_ids = torch.cat([input_examples_ids, positive_examples_ids], 0)
            attention_mask = torch.cat(
                [input_examples_att_mask, input_examples_att_mask.clone()], 0
            )
            pooling_mask = get_pooling_mask(
                input_ids, self.sep_token_id
            )  # Pooling masks indicate the first [SEP] occurrence, used for seq embedding.
            labels = torch.cat([mlm_labels, positive_mlm_labels], 0)
            next_sentence_label = torch.cat(
                [seq_relationship_labels, seq_relationship_labels.clone()], 0
            )
        else:
            input_ids = input_examples_ids
            attention_mask = input_examples_att_mask
            pooling_mask = get_pooling_mask(
                input_ids, self.sep_token_id
            )  # Pooling masks indicate the first [SEP] occurrence, used for seq embedding.
            labels = mlm_labels
            next_sentence_label = seq_relationship_labels

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "pooling_mask": pooling_mask,
            "labels": labels,
            "next_sentence_label": next_sentence_label,
        }
class TestCollator:
    """Test collator object mapping sequences of items from dataset instance
    into batches of IDs and masks used for evaluating models.
    """

    def __init__(self, tokenizer_path: str, maximum_length: int, **kwargs) -> None:
        """Creates instance of collator.

        Args:
            tokenizer_path (str): Path to tokenizer.
            maximum_length (int): Truncating length of token sequences.
            **kwargs: Accepted and ignored.
        """
        self.maximum_length = maximum_length

        self.tokenizer = prepare_tokenizer(tokenizer_path)

        # Build the vocabulary mapping once and reuse it for all four lookups.
        vocab = self.tokenizer.get_vocab()
        self.sep_token_id = vocab[self.tokenizer.sep_token]
        self.pad_token_id = vocab[self.tokenizer.pad_token]
        self.mask_token_id = vocab[self.tokenizer.mask_token]
        self.cls_token_id = vocab[self.tokenizer.cls_token]

    def __call__(self, batch: List[Dict]) -> Dict[str, torch.Tensor]:
        """Maps list of pairs of examples to batches of token ids and masks used for evaluation.

        Args:
            batch (List[Dict]): List of pairs of examples.

        Returns:
            Dict[str, torch.Tensor]: Batches of tokens and masks. All sources are
                encoded first, followed by all targets, in a single padded batch.
        """
        # Each side is wrapped on its own with the template: [CLS] TEXT [SEP]
        source_examples_list = [
            f"{CLS_TOKEN}{el[0]}{SEPARATOR_TOKEN}" for el in batch
        ]
        target_examples_list = [
            f"{CLS_TOKEN}{el[1]}{SEPARATOR_TOKEN}" for el in batch
        ]

        # Sources and targets are tokenized together so they share the same padded length.
        source_target_examples_encoding = self.tokenizer(
            source_examples_list + target_examples_list,
            padding="longest",
            max_length=self.maximum_length,
            truncation=True,
            return_tensors="pt",
        )

        source_target_examples_ids = source_target_examples_encoding.input_ids
        source_target_examples_att_mask = source_target_examples_encoding.attention_mask

        return {
            "source_target_ids": source_target_examples_ids,
            "source_target_att_mask": source_target_examples_att_mask,
            "return_loss": True,  # This is so Trainer.prediction_step() will call Trainer.compute_loss during eval()
        }
class Collator:
    """Object wrapping both train and test collators.

    Decides which collator to use depending on the configuration of the batch.
    Pair of examples --> test instance
    Triplet of examples --> train instances
    """

    def __init__(
        self,
        tokenizer_path: str,
        maximum_length: int,
        mlm_masking_probability: float = 0.5,
        contrastive_masking_probability: float = 0.5,
        ignore_contrastive_loss_data: bool = False,
    ) -> None:
        """Creates instance of collator.

        Args:
            tokenizer_path (str): Path to tokenizer.
            maximum_length (int): Truncating length of token sequences.
            mlm_masking_probability (float, optional): Masking probability for MLM objective. Defaults to 0.5.
            contrastive_masking_probability (float, optional): Masking probability for contrastive objective. Defaults to 0.5.
            ignore_contrastive_loss_data (bool, optional): Do not append positive pairs to batch. Defaults to False.
        """
        self.train_collator = TrainCollator(
            tokenizer_path=tokenizer_path,
            maximum_length=maximum_length,
            mlm_masking_probability=mlm_masking_probability,
            contrastive_masking_probability=contrastive_masking_probability,
            ignore_contrastive_loss_data=ignore_contrastive_loss_data,
        )

        self.test_collator = TestCollator(
            tokenizer_path=tokenizer_path,
            maximum_length=maximum_length,
        )

        # Exposed for callers; both values are read from the train collator.
        self.vocabulary_size = len(self.train_collator.tokenizer.vocab)
        self.pad_token_id = self.train_collator.pad_token_id

    def __call__(self, batch: List[Dict]) -> Dict[str, torch.Tensor]:
        """Routes a batch to the train or test collator based on the size of its first item.

        Args:
            batch (List[Dict]): List of examples; triplets are treated as training
                data, pairs as test data.

        Returns:
            Dict[str, torch.Tensor]: Output of the selected collator.

        Raises:
            AttributeError: If the first item holds neither two nor three elements.
        """
        # Only the first item is inspected; the rest of the batch is assumed to match.
        if len(batch[0]) == 3:
            return self.train_collator(batch)
        elif len(batch[0]) == 2:
            return self.test_collator(batch)
        else:
            raise AttributeError("Unknown batch configuration.")
# --- Web page residue captured with the file (not part of the module) ---
# Provide feedback
# Saved searches
# Use saved searches to filter your results more quickly
# Files / Expand file tree
# datasets_loader.py
# Latest commit
# History
# datasets_loader.py
# File metadata and controls