From 92c07b5f23fa0493e4bb887b0b783e23864c954f Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Fri, 9 Jun 2023 14:10:47 +0000 Subject: [PATCH 1/2] Added notebook with code search eval. --- c2c_search_eval.ipynb | 273 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100644 c2c_search_eval.ipynb diff --git a/c2c_search_eval.ipynb b/c2c_search_eval.ipynb new file mode 100644 index 0000000..81dc56d --- /dev/null +++ b/c2c_search_eval.ipynb @@ -0,0 +1,273 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict\n", + "import torch\n", + "from src import datasets_loader\n", + "from src.utils import retrieval_eval, pool_and_normalize\n", + "from src.constants import GFG_DATA_PATH\n", + "from transformers import AutoModel, AutoTokenizer\n", + "from src.datasets_loader import prepare_tokenizer\n", + "from src.preprocessing_utils import truncate_sentences\n", + "from abc import ABC, abstractmethod\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "DEVICE = \"cuda:0\"\n", + "\n", + "EVAL_CONFIGS =[\n", + " {\"model_path\": \"starencoder\", \"maximum_raw_length\": 10000, \"maximum_input_length\": 1024, \"device\": DEVICE},\n", + " {\"model_path\": \"codebert\", \"maximum_raw_length\": 10000, \"maximum_input_length\": 512, \"device\": DEVICE}\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def set_device(inputs: Dict[str, torch.Tensor], device: str) -> Dict[str, torch.Tensor]:\n", + " output_data = {}\n", + " for k, v in inputs.items():\n", + " output_data[k] = v.to(device)\n", + "\n", + " return output_data\n", + "\n", + "\n", + "def get_dataset(maximum_raw_length):\n", + " test_data = datasets_loader.get_dataset( # Geeks4Geeks data\n", + " dataset_name=\"gfg\",\n", + " path_to_cache=GFG_DATA_PATH,\n", + " split=\"test\",\n", + " maximum_raw_length=maximum_raw_length,\n", + " )\n", + "\n", + " return test_data\n", + "\n", + "\n", + "class BaseEncoder(torch.nn.Module, ABC):\n", + " def __init__(self, device, max_input_len, maximum_token_len, model_name):\n", + " super().__init__()\n", + "\n", + " self.model_name = model_name\n", + " self.tokenizer = prepare_tokenizer(model_name)\n", + " self.encoder = (\n", + " AutoModel.from_pretrained(model_name, use_auth_token=True).to(DEVICE).eval()\n", + " )\n", + " self.device = device\n", + " self.max_input_len = max_input_len\n", + " self.maximum_token_len = maximum_token_len\n", + "\n", + " @abstractmethod\n", + " def forward(\n", + " self,\n", + " ):\n", + " pass\n", + "\n", + " def encode(self, input_sentences, batch_size=32, **kwargs):\n", + " truncated_input_sentences = truncate_sentences(\n", + " input_sentences, self.max_input_len\n", + " )\n", + "\n", + " n_batches = len(truncated_input_sentences) // batch_size + int(\n", + " len(truncated_input_sentences) % batch_size > 0\n", + " )\n", + "\n", + " embedding_batch_list = []\n", + "\n", + " for i in range(n_batches):\n", + " start_idx = i * batch_size\n", + " end_idx = min((i + 1) * batch_size, len(truncated_input_sentences))\n", + "\n", + " with torch.no_grad():\n", + " embedding_batch_list.append(\n", + " self.forward(truncated_input_sentences[start_idx:end_idx])\n", + " .detach()\n", + " .cpu()\n", + " )\n", + "\n", + " return torch.cat(embedding_batch_list)\n", + "\n", + "\n", + "class StarEncoder(BaseEncoder):\n", + " def __init__(self, device, max_input_len, maximum_token_len):\n", + " super().__init__(\n", + " device,\n", + " max_input_len,\n", + " maximum_token_len,\n", + " model_name=\"bigcode/starencoder\",\n", + " )\n", + "\n", + " def forward(self, input_sentences):\n", + " inputs = self.tokenizer(\n", + " [\n", + " f\"{self.tokenizer.cls_token}{sentence}{self.tokenizer.sep_token}\"\n", + " for sentence in input_sentences\n", + " ],\n", + " padding=\"longest\",\n", + " max_length=self.maximum_token_len,\n", + " truncation=True,\n", + " return_tensors=\"pt\",\n", + " )\n", + "\n", + " outputs = self.encoder(**set_device(inputs, self.device))\n", + " embedding = pool_and_normalize(outputs.hidden_states[-1], inputs.attention_mask)\n", + "\n", + " return embedding\n", + "\n", + "\n", + "class CodeBERT(BaseEncoder):\n", + " def __init__(self, device, max_input_len, maximum_token_len):\n", + " super().__init__(\n", + " device,\n", + " max_input_len,\n", + " maximum_token_len,\n", + " model_name=\"microsoft/codebert-base\",\n", + " )\n", + "\n", + " self.tokenizer = AutoTokenizer.from_pretrained(\"microsoft/codebert-base\")\n", + "\n", + " def forward(self, input_sentences):\n", + " inputs = self.tokenizer(\n", + " [sentence for sentence in input_sentences],\n", + " padding=\"longest\",\n", + " max_length=self.maximum_token_len,\n", + " truncation=True,\n", + " return_tensors=\"pt\",\n", + " )\n", + "\n", + " inputs = set_device(inputs, self.device)\n", + "\n", + " outputs = self.encoder(inputs[\"input_ids\"], inputs[\"attention_mask\"])\n", + "\n", + " embedding = outputs[\"pooler_output\"]\n", + "\n", + " return torch.cat([torch.Tensor(el)[None, :] for el in embedding])\n", + "\n", + "\n", + "def evaluate(model_path, maximum_raw_length, maximum_input_length, device):\n", + " if \"starencoder\" in model_path.lower():\n", + " model = StarEncoder(\n", + " device, maximum_raw_length, maximum_input_length\n", + " )\n", + " elif \"codebert\" in model_path.lower():\n", + " model = CodeBERT(\n", + " device, maximum_raw_length, maximum_input_length\n", + " )\n", + " else:\n", + " raise ValueError(\n", + " \"Unsupported model type. We currently support starencoder and codebert.\"\n", + " )\n", + "\n", + " model = model.to(device)\n", + " model.eval()\n", + "\n", + " test_data = get_dataset(maximum_raw_length)\n", + "\n", + " source_entries, target_entries = [], []\n", + " for source, target in test_data:\n", + " source_entries.append(source)\n", + " target_entries.append(target)\n", + "\n", + " source_embeddings = model.encode(source_entries)\n", + " target_embeddings = model.encode(target_entries)\n", + "\n", + " recall_at_1, recall_at_5, mean_reciprocal_rank = retrieval_eval(\n", + " source_embeddings, target_embeddings\n", + " )\n", + "\n", + " print(\n", + " f\"\\n{model_path}: R@1: {recall_at_1.item()}, R@5: {recall_at_5.item()}, MRR: {mean_reciprocal_rank.item()}\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using pad_token, but it is not set yet.\n", + "Using sep_token, but it is not set yet.\n", + "Using cls_token, but it is not set yet.\n", + "Using mask_token, but it is not set yet.\n", + "Some weights of the model checkpoint at bigcode/starencoder were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']\n", + "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Loading cached shuffled indices for dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-e9f62aa12abed28d.arrow\n", + "Loading cached processed dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-62c8dbaa90db85ee_*_of_00096.arrow\n", + "Loading cached shuffled indices for dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-f652c1e33d8c1a14.arrow\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "starencoder: R@1: 0.7222222089767456, R@5: 0.8767361044883728, MRR: 0.7930026054382324\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading cached shuffled indices for dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-e9f62aa12abed28d.arrow\n", + "Loading cached processed dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-62c8dbaa90db85ee_*_of_00096.arrow\n", + "Loading cached shuffled indices for dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-f652c1e33d8c1a14.arrow\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "codebert: R@1: 0.0052083334885537624, R@5: 0.02777777798473835, MRR: 0.025095948949456215\n" + ] + } + ], + "source": [ + "for eval_cfg in EVAL_CONFIGS:\n", + " evaluate(**eval_cfg)" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "ae635839a86c404533bb974203baf1bd26d9dc49bfbf145b45e9350c30045fdd" + }, + "kernelspec": { + "display_name": "Python 3.9.13 64-bit ('accelerate')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 11720f346b519d70385bc39ed1901afad76514dd Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Fri, 9 Jun 2023 14:11:11 +0000 Subject: [PATCH 2/2] Added notebook code_bert_score eval. --- code_bert_score.ipynb | 306 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 code_bert_score.ipynb diff --git a/code_bert_score.ipynb b/code_bert_score.ipynb new file mode 100644 index 0000000..da05fc9 --- /dev/null +++ b/code_bert_score.ipynb @@ -0,0 +1,306 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import code_bert_score\n", + "except ImportError:\n", + " print(\n", + " \"This notebook requires a fork of code_bert_score to be installed:\", \n", + " \"pip install git+https://github.com/joaomonteirof/code-bert-score\"\n", + " )\n", + "from src import datasets_loader\n", + "from src.constants import GFG_DATA_PATH\n", + "import numpy as np\n", + "from matplotlib import pyplot as plt\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "DEVICE = \"cuda:0\"\n", + "MAX_RAW_LEN = 10000" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def get_dataset(maximum_raw_length):\n", + " test_data = datasets_loader.get_dataset( # Geeks4Geeks data\n", + " dataset_name=\"gfg\",\n", + " path_to_cache=GFG_DATA_PATH,\n", + " split=\"test\",\n", + " maximum_raw_length=maximum_raw_length,\n", + " )\n", + "\n", + " return test_data\n", + "\n", + "\n", + "def summarize_sequence(sequence):\n", + " def ci95(seq):\n", + " return 1.96 * np.std(seq) / np.sqrt(len(seq))\n", + "\n", + " return f\"{100*np.mean(sequence):.2f}+-{100*ci95(sequence):.2f}\"\n", + "\n", + "\n", + "def evaluate(model_path, maximum_raw_length):\n", + " test_data = get_dataset(maximum_raw_length)\n", + "\n", + " source_entries, target_entries = [], []\n", + " for source, target in test_data:\n", + " source_entries.append(source)\n", + " target_entries.append(target)\n", + "\n", + " return code_bert_score.score(\n", + " source_entries, target_entries, model_type=model_path, verbose=True\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading cached shuffled indices for dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-e9f62aa12abed28d.arrow\n", + "Loading cached processed dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-62c8dbaa90db85ee_*_of_00096.arrow\n", + "Loading cached shuffled indices for dataset at /mnt/home/research-BertBigCode/resources/data/transcoder_evaluation_gfg/cache-f652c1e33d8c1a14.arrow\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "calculating scores...\n", + "computing bert embedding.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e32a024f30be4952a6e16e8731c4924a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/18 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(6,6), facecolor=\"white\")\n", + "\n", + "ax.set_title(\"Python-Java cross-language CodeBERTScore\")\n", + "ax.get_xaxis().set_visible(False)\n", + "ax.set_ylabel(\"CodeBERTScore\")\n", + "\n", + "starencoder_ci = summarize_sequence(f1_starencoder.detach().cpu().numpy())\n", + "codebert_ci = summarize_sequence(f1_codebert.detach().cpu().numpy())\n", + "\n", + "starencoder_center, starencoder_error = starencoder_ci.split(\"+-\")\n", + "codebert_center, codebert_error = codebert_ci.split(\"+-\")\n", + "\n", + "plt.plot(1, float(codebert_center), color=\"blue\", label=\"codeBERT-base\")\n", + "plt.errorbar(1, float(codebert_center), yerr=float(codebert_error), fmt ='-o', color=\"blue\", capsize=8)\n", + "plt.plot(2, float(starencoder_center), color=\"orange\", label=\"StarEncoder\")\n", + "plt.errorbar(2, float(starencoder_center), yerr=float(starencoder_error), fmt ='-o', color=\"orange\", capsize=8)\n", + "\n", + "plt.legend(loc=\"lower right\")\n", + "plt.show()\n", + " " + ] + } + ], + "metadata": { + "interpreter": { + "hash": "ae635839a86c404533bb974203baf1bd26d9dc49bfbf145b45e9350c30045fdd" + }, + "kernelspec": { + "display_name": "Python 3.9.13 64-bit ('accelerate')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}