bigcode-project · joaomonteirof · Jun 9, 2023 · Jun 9, 2023 · Jun 9, 2023
diff --git a/c2c_search_eval.ipynb b/c2c_search_eval.ipynb
@@ -0,0 +1,273 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Dict\n",
+    "import torch\n",
+    "from src import datasets_loader\n",
+    "from src.utils import retrieval_eval, pool_and_normalize\n",
+    "from src.constants import GFG_DATA_PATH\n",
+    "from transformers import AutoModel, AutoTokenizer\n",
+    "from src.datasets_loader import prepare_tokenizer\n",
+    "from src.preprocessing_utils import truncate_sentences\n",
+    "from abc import ABC, abstractmethod\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DEVICE = \"cuda:0\"  # device passed to every evaluation run below\n",
+    "# One keyword-argument set per model; each dict is unpacked into evaluate().\n",
+    "EVAL_CONFIGS = [\n",
+    "    dict(model_path=\"starencoder\", maximum_raw_length=10000, maximum_input_length=1024, device=DEVICE),\n",
+    "    dict(model_path=\"codebert\", maximum_raw_length=10000, maximum_input_length=512, device=DEVICE),\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def set_device(inputs: Dict[str, torch.Tensor], device: str) -> Dict[str, torch.Tensor]:\n",
+    "    \"\"\"Return a new dict with every tensor in `inputs` moved to `device`.\n",
+    "\n",
+    "    The mapping passed in is left untouched; keys are kept as they are.\n",
+    "    \"\"\"\n",
+    "    return {name: tensor.to(device) for name, tensor in inputs.items()}\n",
+    "\n",
+    "\n",
+    "def get_dataset(maximum_raw_length):\n",
+    "    \"\"\"Load the test split of the Geeks4Geeks (\"gfg\") dataset.\n",
+    "\n",
+    "    `maximum_raw_length` is forwarded unchanged to `datasets_loader.get_dataset`.\n",
+    "    \"\"\"\n",
+    "    fixed_args = dict(dataset_name=\"gfg\", path_to_cache=GFG_DATA_PATH, split=\"test\")\n",
+    "    return datasets_loader.get_dataset(\n",
+    "        maximum_raw_length=maximum_raw_length, **fixed_args\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "class BaseEncoder(torch.nn.Module, ABC):\n",
+    "    \"\"\"Shared base for the encoders compared in this notebook.\n",
+    "\n",
+    "    Loads the tokenizer and pretrained encoder for `model_name`, puts the\n",
+    "    encoder on `device` in eval mode, and adds batched `encode` on top of the\n",
+    "    `forward` method that each subclass implements.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, device, max_input_len, maximum_token_len, model_name):\n",
+    "        \"\"\"Load tokenizer and encoder for `model_name` and store the limits.\n",
+    "\n",
+    "        `max_input_len` is passed to `truncate_sentences` in `encode`;\n",
+    "        `maximum_token_len` is stored for subclasses (used there as the\n",
+    "        tokenizer's `max_length`).\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "        self.model_name = model_name\n",
+    "        self.tokenizer = prepare_tokenizer(model_name)\n",
+    "        # Use the `device` argument (not the global DEVICE) so encoder and self.device agree.\n",
+    "        pretrained = AutoModel.from_pretrained(model_name, use_auth_token=True)\n",
+    "        self.encoder = pretrained.to(device).eval()\n",
+    "        self.device = device\n",
+    "        self.max_input_len = max_input_len\n",
+    "        self.maximum_token_len = maximum_token_len\n",
+    "\n",
+    "    @abstractmethod\n",
+    "    def forward(self, input_sentences):\n",
+    "        \"\"\"Embed one batch of sentences; `encode` concatenates the results.\"\"\"\n",
+    "\n",
+    "    def encode(self, input_sentences, batch_size=32, **kwargs):\n",
+    "        \"\"\"Embed `input_sentences` in batches and return one CPU tensor.\n",
+    "\n",
+    "        Inputs are first cut with `truncate_sentences`; `forward` then runs on\n",
+    "        slices of `batch_size` under `torch.no_grad()`. `**kwargs` is unused.\n",
+    "        \"\"\"\n",
+    "        sentences = truncate_sentences(input_sentences, self.max_input_len)\n",
+    "        embedding_batch_list = []\n",
+    "        with torch.no_grad():\n",
+    "            for start_idx in range(0, len(sentences), batch_size):\n",
+    "                batch = sentences[start_idx : start_idx + batch_size]\n",
+    "                embedding_batch_list.append(self.forward(batch).detach().cpu())\n",
+    "        return torch.cat(embedding_batch_list)\n",
+    "\n",
+    "\n",
+    "class StarEncoder(BaseEncoder):\n",
+    "    \"\"\"Encoder wrapper around the `bigcode/starencoder` checkpoint.\"\"\"\n",
+    "\n",
+    "    def __init__(self, device, max_input_len, maximum_token_len):\n",
+    "        \"\"\"Forward the arguments to `BaseEncoder` with the StarEncoder model name.\"\"\"\n",
+    "        super().__init__(device, max_input_len, maximum_token_len, model_name=\"bigcode/starencoder\")\n",
+    "\n",
+    "    def forward(self, input_sentences):\n",
+    "        \"\"\"Embed a batch: wrap in CLS/SEP tokens, tokenize, encode, then pool.\n",
+    "\n",
+    "        The embedding comes from `pool_and_normalize`, applied to the last entry\n",
+    "        of the encoder output's `hidden_states` and the tokenizer's attention mask.\n",
+    "        \"\"\"\n",
+    "        wrapped = [\n",
+    "            f\"{self.tokenizer.cls_token}{sentence}{self.tokenizer.sep_token}\"\n",
+    "            for sentence in input_sentences\n",
+    "        ]\n",
+    "        tokenized = self.tokenizer(\n",
+    "            wrapped, padding=\"longest\", max_length=self.maximum_token_len,\n",
+    "            truncation=True, return_tensors=\"pt\",\n",
+    "        )\n",
+    "        # NOTE(review): only the dict passed to the encoder is moved to self.device;\n",
+    "        # the attention mask used for pooling comes from the original tokenizer output.\n",
+    "        encoder_out = self.encoder(**set_device(tokenized, self.device))\n",
+    "        return pool_and_normalize(encoder_out.hidden_states[-1], tokenized.attention_mask)\n",
+    "\n",
+    "\n",
+    "class CodeBERT(BaseEncoder):\n",
+    "    \"\"\"Encoder wrapper around the `microsoft/codebert-base` checkpoint.\"\"\"\n",
+    "\n",
+    "    def __init__(self, device, max_input_len, maximum_token_len):\n",
+    "        \"\"\"Set up the base encoder, then swap in the checkpoint's own tokenizer.\"\"\"\n",
+    "        super().__init__(\n",
+    "            device, max_input_len, maximum_token_len, model_name=\"microsoft/codebert-base\"\n",
+    "        )\n",
+    "        # Replaces the tokenizer that BaseEncoder built via prepare_tokenizer.\n",
+    "        self.tokenizer = AutoTokenizer.from_pretrained(\"microsoft/codebert-base\")\n",
+    "\n",
+    "    def forward(self, input_sentences):\n",
+    "        \"\"\"Embed a batch of sentences with CodeBERT's pooler output.\n",
+    "\n",
+    "        Tokenizes with padding/truncation at `self.maximum_token_len`, runs the\n",
+    "        encoder on `self.device`, and returns its `pooler_output` unchanged.\n",
+    "        \"\"\"\n",
+    "        inputs = self.tokenizer(\n",
+    "            list(input_sentences),\n",
+    "            padding=\"longest\", max_length=self.maximum_token_len,\n",
+    "            truncation=True, return_tensors=\"pt\",\n",
+    "        )\n",
+    "        inputs = set_device(inputs, self.device)\n",
+    "        outputs = self.encoder(inputs[\"input_ids\"], inputs[\"attention_mask\"])\n",
+    "        # pooler_output is already one batched tensor; returning it directly avoids\n",
+    "        # re-wrapping every row with the legacy torch.Tensor constructor and torch.cat.\n",
+    "        return outputs[\"pooler_output\"]\n",
+    "\n",
+    "\n",
+    "def evaluate(model_path, maximum_raw_length, maximum_input_length, device):\n",
+    "    \"\"\"Run retrieval evaluation of one model on the GFG test set and print the metrics.\n",
+    "\n",
+    "    `model_path` selects the encoder by case-insensitive substring\n",
+    "    (\"starencoder\" is checked before \"codebert\"); anything else raises\n",
+    "    ValueError. Source and target entries of the test set are embedded\n",
+    "    separately and scored with `retrieval_eval`; R@1, R@5 and MRR are printed.\n",
+    "    \"\"\"\n",
+    "    lowered = model_path.lower()\n",
+    "    if \"starencoder\" in lowered:\n",
+    "        encoder_cls = StarEncoder\n",
+    "    elif \"codebert\" in lowered:\n",
+    "        encoder_cls = CodeBERT\n",
+    "    else:\n",
+    "        raise ValueError(\n",
+    "            \"Unsupported model type. We currently support starencoder and codebert.\"\n",
+    "        )\n",
+    "\n",
+    "    model = encoder_cls(device, maximum_raw_length, maximum_input_length).to(device)\n",
+    "    model.eval()\n",
+    "\n",
+    "    # Split the (source, target) pairs of the test set into two parallel lists.\n",
+    "    pairs = [(source, target) for source, target in get_dataset(maximum_raw_length)]\n",
+    "    source_entries = [source for source, _ in pairs]\n",
+    "    target_entries = [target for _, target in pairs]\n",
+    "\n",
+    "    source_embeddings = model.encode(source_entries)\n",
+    "    target_embeddings = model.encode(target_entries)\n",
+    "    metrics = retrieval_eval(source_embeddings, target_embeddings)\n",
+    "    recall_at_1, recall_at_5, mean_reciprocal_rank = metrics\n",
+    "\n",
+    "    print(\n",
+    "        f\"\\n{model_path}: R@1: {recall_at_1.item()}, R@5: {recall_at_5.item()}, MRR: {mean_reciprocal_rank.item()}\"\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using pad_token, but it is not set yet.\n",
+      "Using sep_token, but it is not set yet.\n",
+      "Using cls_token, but it is not set yet.\n",
+      "Using mask_token, but it is not set yet.\n",
+      "Some weights of the model checkpoint at bigcode/starencoder were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']\n",
+      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Loading cached shuffled indices for dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-e9f62aa12abed28d.arrow\n",
+      "Loading cached processed dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-62c8dbaa90db85ee_*_of_00096.arrow\n",
+      "Loading cached shuffled indices for dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-f652c1e33d8c1a14.arrow\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "starencoder: R@1: 0.7222222089767456, R@5: 0.8767361044883728, MRR: 0.7930026054382324\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading cached shuffled indices for dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-e9f62aa12abed28d.arrow\n",
+      "Loading cached processed dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-62c8dbaa90db85ee_*_of_00096.arrow\n",
+      "Loading cached shuffled indices for dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-f652c1e33d8c1a14.arrow\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "codebert: R@1: 0.0052083334885537624, R@5: 0.02777777798473835, MRR: 0.025095948949456215\n"
+     ]
+    }
+   ],
+   "source": [
+    "for config in EVAL_CONFIGS:  # one evaluation run per configured model\n",
+    "    evaluate(**config)"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "ae635839a86c404533bb974203baf1bd26d9dc49bfbf145b45e9350c30045fdd"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.13 64-bit ('accelerate')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}