From 5d431003b9c5e318fe315b1e9bb6eea78349917f Mon Sep 17 00:00:00 2001
From: Lim Chee Kin <limcheekin@vobject.com>
Date: Tue, 13 Jun 2023 11:53:39 +0800
Subject: [PATCH 1/9] feat: added files for aws lambda deployment

---
 .github/workflows/dev.yml    | 29 ++++++++++++++++
 .github/workflows/rm-dev.yml | 18 ++++++++++
 Dockerfile.aws               | 65 ++++++++++++++++++++++++++++++++++++
 llama_cpp/server/aws.py      |  8 +++++
 requirements.txt             |  5 +++
 serverless.yml               | 43 ++++++++++++++++++++++++
 6 files changed, 168 insertions(+)
 create mode 100644 .github/workflows/dev.yml
 create mode 100644 .github/workflows/rm-dev.yml
 create mode 100644 Dockerfile.aws
 create mode 100644 llama_cpp/server/aws.py
 create mode 100644 requirements.txt
 create mode 100644 serverless.yml

diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
new file mode 100644
index 0000000000..f2639eeb71
--- /dev/null
+++ b/.github/workflows/dev.yml
@@ -0,0 +1,29 @@
+name: Deploy Dev
+on:
+  # workflow_dispatch only supports the `inputs` key; it has no `branches` filter,
+  # so choose the branch to deploy from in the "Run workflow" dialog instead.
+  workflow_dispatch:
+jobs:
+  deploy-dev:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+        with:
+          submodules: "true"
+      - name: Setup NodeJS 18
+        uses: actions/setup-node@v3
+        with:
+          node-version: 18
+      - name: Install Serverless Framework
+        run: npm install -g serverless
+      - name: Serverless AWS authentication
+        run: sls config credentials --provider aws --key ${{ secrets.AWS_KEY }} --secret ${{ secrets.AWS_SECRET }}
+      - name: Deploy Lambda functions
+        run: sls deploy
+      - name: Export Endpoint URL
+        run: echo $(sls info --verbose | grep endpoint | sed s/endpoint\:\ //g | awk '{print $1}') > endpoint
+      - name: Echo Endpoint URL
+        run: echo $(cat endpoint)
+      - name: Test Lambda functions # --fail: exit non-zero on HTTP >= 400 so a broken deployment fails this step
+        run: "curl --fail -X POST -H 'Content-Type: application/json' -d @prompt.json $(cat endpoint)v1/completions"
diff --git a/.github/workflows/rm-dev.yml b/.github/workflows/rm-dev.yml
new file mode 100644
index 0000000000..88137ef0c0
--- /dev/null
+++ b/.github/workflows/rm-dev.yml
@@ -0,0 +1,18 @@
+name: Remove Dev
+on: workflow_dispatch
+
+jobs:
+  rm-dev:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Setup NodeJS 18
+        uses: actions/setup-node@v3
+        with:
+          node-version: 18
+      - name: Install Serverless Framework
+        run: npm install -g serverless
+      - name: Serverless AWS authentication
+        run: sls config credentials --provider aws --key ${{ secrets.AWS_KEY }} --secret ${{ secrets.AWS_SECRET }}
+      - name: Remove Lambda functions
+        run: sls remove
diff --git a/Dockerfile.aws b/Dockerfile.aws
new file mode 100644
index 0000000000..e25e713ced
--- /dev/null
+++ b/Dockerfile.aws
@@ -0,0 +1,65 @@
+# REF: https://aws.amazon.com/blogs/aws/new-for-aws-lambda-container-image-support/
+# The download size of `python:3.10-slim-bullseye` is 45MB and its uncompressed on-disk size is 125MB, per (1).
+# (1) The best Docker base image for your Python application (March 2023). https://pythonspeed.com/articles/base-image-python-docker-images/.
+# (2) Reduce the size of container images with DockerSlim. https://developers.redhat.com/articles/2022/01/17/reduce-size-container-images-dockerslim.
+# Define global args
+ARG FUNCTION_DIR="/home/app/"
+ARG RUNTIME_VERSION="3.10"
+
+# Stage 1 - bundle base image + runtime
+# Declare the shared base image once; the build toolchain (GCC etc.) is installed in stage 2, not here
+FROM python:${RUNTIME_VERSION}-slim-bullseye AS python-slim-bullseye
+
+# Stage 2 - build function and dependencies
+FROM python-slim-bullseye AS build-image
+# Install the native build dependencies (compiler toolchain, ninja, OpenBLAS headers) and curl for the model download
+# REF: https://docs.aws.amazon.com/lambda/latest/dg/images-create.html
+RUN apt-get update && \
+    apt-get install -y \
+    libopenblas-dev \
+    ninja-build \
+    build-essential \
+    curl
+
+# Include global args in this stage of the build
+ARG FUNCTION_DIR
+ARG RUNTIME_VERSION
+# Create function directory
+RUN mkdir -p ${FUNCTION_DIR}
+# Install the function's dependencies
+COPY requirements.txt ./
+RUN python${RUNTIME_VERSION} -m pip install -r requirements.txt --target ${FUNCTION_DIR}
+
+COPY ./ ${FUNCTION_DIR}
+
+RUN python${RUNTIME_VERSION} -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
+# REF: https://github.com/abetlen/llama-cpp-python/blob/main/Dockerfile
+RUN cd ${FUNCTION_DIR} && CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python${RUNTIME_VERSION} setup.py develop
+# Install Lambda Runtime Interface Client for Python
+RUN python${RUNTIME_VERSION} -m pip install awslambdaric --target ${FUNCTION_DIR}
+
+# download the model file
+RUN mkdir ${FUNCTION_DIR}/model
+RUN curl -L https://huggingface.co/SlyEcho/open_llama_3b_ggml/resolve/main/open-llama-3b-q4_0.bin -o ${FUNCTION_DIR}/model/ggml-q4_0.bin
+
+# Stage 3 - final runtime image
+# Grab a fresh copy of the Python image
+FROM python-slim-bullseye
+
+# Install runtime dependencies; skip recommended packages and drop the apt lists in the same layer to keep the final image small
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends libopenblas-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# Include global arg in this stage of the build
+ARG FUNCTION_DIR
+# Set working directory to function root directory
+WORKDIR ${FUNCTION_DIR}
+# Copy in the built dependencies
+COPY --from=build-image ${FUNCTION_DIR} ${FUNCTION_DIR}
+# (Optional) Add Lambda Runtime Interface Emulator and use a script in the ENTRYPOINT for simpler local runs
+ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie /usr/bin/aws-lambda-rie
+COPY entry.sh /
+RUN chmod 755 /usr/bin/aws-lambda-rie /entry.sh
+ENTRYPOINT [ "/entry.sh" ]
+CMD [ "llama_cpp.server.aws.handler" ]
diff --git a/llama_cpp/server/aws.py b/llama_cpp/server/aws.py
new file mode 100644
index 0000000000..e1774b124b
--- /dev/null
+++ b/llama_cpp/server/aws.py
@@ -0,0 +1,8 @@
+"""AWS Lambda function for llama.cpp.
+"""
+from mangum import Mangum
+from llama_cpp.server.app import create_app
+
+handler = Mangum(create_app())
+
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000..71fa3ea5c1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+typing_extensions
+fastapi
+sse-starlette
+mangum
+numpy
\ No newline at end of file
diff --git a/serverless.yml b/serverless.yml
new file mode 100644
index 0000000000..b4ee38f9e7
--- /dev/null
+++ b/serverless.yml
@@ -0,0 +1,43 @@
+service: llama-cpp
+
+frameworkVersion: "3"
+
+provider:
+  name: aws
+  deploymentMethod: direct
+  # REF: https://www.serverless.com/blog/container-support-for-lambda
+  ecr:
+    # In this section you can define images that will be built locally and uploaded to ECR
+    images:
+      appimage:
+        path: ./
+        file: Dockerfile.aws
+  stage: dev
+  region: ap-southeast-1
+  iam:
+    role:
+      statements:
+        - Effect: "Allow"
+          Action:
+            - "lambda:InvokeFunction"
+          Resource: "*"
+
+functions:
+  chat:
+    image:
+      name: appimage
+    memorySize: 3072
+    environment:
+      MODEL: ./model/ggml-q4_0.bin
+    timeout:
+      900
+      # https://www.serverless.com/framework/docs/providers/aws/guide/functions#lambda-function-urls
+    url:
+      # Allow CORS for all requests from any origin
+      cors:
+        allowedOrigins:
+          - "*"
+          #- https://url1.com
+          #- https://url2.com
+        allowedMethods:
+          - POST

From e9045422ea89068bb782b9fb310d6ad82e79cc69 Mon Sep 17 00:00:00 2001
From: Lim Chee Kin <limcheekin@vobject.com>
Date: Tue, 13 Jun 2023 11:55:09 +0800
Subject: [PATCH 2/9] feat: added prompt.json to test for deployed aws lambda
 function

---
 prompt.json | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 prompt.json

diff --git a/prompt.json b/prompt.json
new file mode 100644
index 0000000000..c08f9f7c4f
--- /dev/null
+++ b/prompt.json
@@ -0,0 +1,6 @@
+{
+  "prompt": [
+    "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n"
+  ],
+  "stop": ["\n", "###"]
+}

From 241b0305dea2a7829f2cb053d1f3fe791261d1c6 Mon Sep 17 00:00:00 2001
From: Lim Chee Kin <limcheekin@vobject.com>
Date: Tue, 13 Jun 2023 12:05:22 +0800
Subject: [PATCH 3/9] feat: added the missing entry.sh file

---
 entry.sh | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 entry.sh

diff --git a/entry.sh b/entry.sh
new file mode 100644
index 0000000000..a608361e2b
--- /dev/null
+++ b/entry.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then  # no runtime API address set: wrap the client with the local emulator
+    exec /usr/bin/aws-lambda-rie /usr/local/bin/python -m awslambdaric "$@"
+else  # AWS_LAMBDA_RUNTIME_API is set: start the runtime interface client directly
+    exec /usr/local/bin/python -m awslambdaric "$@"
+fi

From a6ca3e32b585eec14a5b430304ee2c00037ee320 Mon Sep 17 00:00:00 2001
From: Lim Chee Kin <limcheekin@vobject.com>
Date: Tue, 13 Jun 2023 12:20:51 +0800
Subject: [PATCH 4/9] feat: added diskcache as dependency

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 71fa3ea5c1..7b5155b0de 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ typing_extensions
 fastapi
 sse-starlette
 mangum
-numpy
\ No newline at end of file
+numpy
+diskcache
\ No newline at end of file

From b2a3b21e35bc4a01b8a07e0829c9f6a12686961e Mon Sep 17 00:00:00 2001
From: Lim Chee Kin <limcheekin@vobject.com>
Date: Tue, 27 Jun 2023 16:01:40 +0800
Subject: [PATCH 5/9] feat: deploy orca_mini_13B-GGML model

---
 Dockerfile.aws          | 2 +-
 llama_cpp/server/aws.py | 9 +++++----
 serverless.yml          | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/Dockerfile.aws b/Dockerfile.aws
index e25e713ced..96cb06537a 100644
--- a/Dockerfile.aws
+++ b/Dockerfile.aws
@@ -40,7 +40,7 @@ RUN python${RUNTIME_VERSION} -m pip install awslambdaric --target ${FUNCTION_DIR
 
 # download the model file
 RUN mkdir ${FUNCTION_DIR}/model
-RUN curl -L https://huggingface.co/SlyEcho/open_llama_3b_ggml/resolve/main/open-llama-3b-q4_0.bin -o ${FUNCTION_DIR}/model/ggml-q4_0.bin
+RUN curl -L https://huggingface.co/TheBloke/orca_mini_13B-GGML/resolve/main/orca-mini-13b.ggmlv3.q4_0.bin -o ${FUNCTION_DIR}/model/ggml-q4_0.bin
 
 # Stage 3 - final runtime image
 # Grab a fresh copy of the Python image
diff --git a/llama_cpp/server/aws.py b/llama_cpp/server/aws.py
index e1774b124b..46629480c8 100644
--- a/llama_cpp/server/aws.py
+++ b/llama_cpp/server/aws.py
@@ -1,8 +1,9 @@
 """AWS Lambda function for llama.cpp.
 """
 from mangum import Mangum
-from llama_cpp.server.app import create_app
-
-handler = Mangum(create_app())
-
+from llama_cpp.server.app import create_app, Settings
+import os
 
+print("os.cpu_count()", os.cpu_count())
+handler = Mangum(create_app(  # os.cpu_count() may return None, so fall back to a single thread
+    Settings(n_threads=os.cpu_count() or 1, embedding=False)))
diff --git a/serverless.yml b/serverless.yml
index b4ee38f9e7..d370c0cb1c 100644
--- a/serverless.yml
+++ b/serverless.yml
@@ -26,7 +26,7 @@ functions:
   chat:
     image:
       name: appimage
-    memorySize: 3072
+    memorySize: 10240
     environment:
       MODEL: ./model/ggml-q4_0.bin
     timeout:

From 96d991a723d27fd3f3656ead8a6f928774ef0e45 Mon Sep 17 00:00:00 2001
From: Lim Chee Kin <limcheekin@vobject.com>
Date: Tue, 27 Jun 2023 16:48:15 +0800
Subject: [PATCH 6/9] chore: changed to orca-mini-7b as failed in 13b

---
 Dockerfile.aws | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.aws b/Dockerfile.aws
index 96cb06537a..96cf36177f 100644
--- a/Dockerfile.aws
+++ b/Dockerfile.aws
@@ -40,7 +40,7 @@ RUN python${RUNTIME_VERSION} -m pip install awslambdaric --target ${FUNCTION_DIR
 
 # download the model file
 RUN mkdir ${FUNCTION_DIR}/model
-RUN curl -L https://huggingface.co/TheBloke/orca_mini_13B-GGML/resolve/main/orca-mini-13b.ggmlv3.q4_0.bin -o ${FUNCTION_DIR}/model/ggml-q4_0.bin
+RUN curl -L https://huggingface.co/TheBloke/orca_mini_7B-GGML/resolve/main/orca-mini-7b.ggmlv3.q4_0.bin -o ${FUNCTION_DIR}/model/ggml-q4_0.bin
 
 # Stage 3 - final runtime image
 # Grab a fresh copy of the Python image

From 958f501c3a7a1976152a0b3f3547a55ff677f284 Mon Sep 17 00:00:00 2001
From: Lim Chee Kin <limcheekin@vobject.com>
Date: Mon, 10 Jul 2023 20:01:22 +0800
Subject: [PATCH 7/9] chore: updated to orca-mini-v2-7b

---
 Dockerfile.aws | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.aws b/Dockerfile.aws
index 96cf36177f..46c2b57cd7 100644
--- a/Dockerfile.aws
+++ b/Dockerfile.aws
@@ -40,7 +40,7 @@ RUN python${RUNTIME_VERSION} -m pip install awslambdaric --target ${FUNCTION_DIR
 
 # download the model file
 RUN mkdir ${FUNCTION_DIR}/model
-RUN curl -L https://huggingface.co/TheBloke/orca_mini_7B-GGML/resolve/main/orca-mini-7b.ggmlv3.q4_0.bin -o ${FUNCTION_DIR}/model/ggml-q4_0.bin
+RUN curl -L https://huggingface.co/TheBloke/orca_mini_v2_7B-GGML/resolve/main/orca-mini-v2_7b.ggmlv3.q4_0.bin -o ${FUNCTION_DIR}/model/ggml-q4_0.bin
 
 # Stage 3 - final runtime image
 # Grab a fresh copy of the Python image

From 2d83e8873391e52707e8bd69051e193a90b343b1 Mon Sep 17 00:00:00 2001
From: Lim Chee Kin <limcheekin@vobject.com>
Date: Mon, 10 Jul 2023 21:13:46 +0800
Subject: [PATCH 8/9] chore: added pydantic_settings dependency

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 7b5155b0de..e2b91c07cc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ fastapi
 sse-starlette
 mangum
 numpy
-diskcache
\ No newline at end of file
+diskcache
+pydantic_settings

From 5f9f46e8b86e535fb92b34d0891da5c0e66c9c98 Mon Sep 17 00:00:00 2001
From: Lim Chee Kin <limcheekin@vobject.com>
Date: Mon, 14 Aug 2023 17:22:34 +0800
Subject: [PATCH 9/9] chore: updated docker file to test the latest code

---
 Dockerfile.aws   | 5 +++--
 vendor/llama.cpp | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Dockerfile.aws b/Dockerfile.aws
index 46c2b57cd7..2f1983c074 100644
--- a/Dockerfile.aws
+++ b/Dockerfile.aws
@@ -19,6 +19,7 @@ RUN apt-get update && \
     libopenblas-dev \
     ninja-build \
     build-essential \
+    pkg-config \
     curl
 
 # Include global args in this stage of the build
@@ -34,13 +35,13 @@ COPY ./ ${FUNCTION_DIR}
 
 RUN python${RUNTIME_VERSION} -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
 # REF: https://github.com/abetlen/llama-cpp-python/blob/main/Dockerfile
-RUN cd ${FUNCTION_DIR} && CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python${RUNTIME_VERSION} setup.py develop
+RUN cd ${FUNCTION_DIR} && CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" FORCE_CMAKE=1 python${RUNTIME_VERSION} setup.py develop
 # Install Lambda Runtime Interface Client for Python
 RUN python${RUNTIME_VERSION} -m pip install awslambdaric --target ${FUNCTION_DIR}
 
 # download the model file
 RUN mkdir ${FUNCTION_DIR}/model
-RUN curl -L https://huggingface.co/TheBloke/orca_mini_v2_7B-GGML/resolve/main/orca-mini-v2_7b.ggmlv3.q4_0.bin -o ${FUNCTION_DIR}/model/ggml-q4_0.bin
+RUN curl -fL https://huggingface.co/TheBloke/orca_mini_v3_7B-GGML/resolve/main/orca_mini_v3_7b.ggmlv3.q4_0.bin -o ${FUNCTION_DIR}/model/ggml-q4_0.bin
 
 # Stage 3 - final runtime image
 # Grab a fresh copy of the Python image
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index f5bfea0580..edcc7ae7d2 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit f5bfea0580e417f99850d5456ca541d871a3e48c
+Subproject commit edcc7ae7d26007bbf83136e9d33f863fcad9b871