diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..4ced35a7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,6 @@ +blank_issues_enabled: true +contact_links: + - name: 🤔 Ask a Question + url: 'https://github.com/pinecone-io/canopy/discussions/new?category=q-a' + about: Ask a question about how to use Canopy using GitHub discussions + diff --git a/.github/actions/install-deps-and-canopy/action.yml b/.github/actions/install-deps-and-canopy/action.yml index 7ee0fddc..c1fc8785 100644 --- a/.github/actions/install-deps-and-canopy/action.yml +++ b/.github/actions/install-deps-and-canopy/action.yml @@ -9,7 +9,6 @@ inputs: description: "Whether to install canopy library, or dependencies only" required: true default: "true" - runs: using: "composite" steps: @@ -37,8 +36,8 @@ runs: - name: Install dependencies shell: bash if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-root --all-extras --with dev + run: make install-extras POETRY_INSTALL_ARGS="--no-interaction --no-root --with dev" - name: Install project if: ${{ inputs.install-canopy == 'true' }} shell: bash - run: poetry install --no-interaction --all-extras --with dev + run: make install-extras POETRY_INSTALL_ARGS="--with dev --no-interaction" diff --git a/.github/workflows/build-push-image.yml b/.github/workflows/build-push-image.yml index 45326c47..aac133e7 100644 --- a/.github/workflows/build-push-image.yml +++ b/.github/workflows/build-push-image.yml @@ -48,12 +48,19 @@ jobs: type=semver,pattern={{version}},enable=${{ github.event_name == 'push' }} type=raw,value=latest,enable=${{ github.event_name != 'push' }} type=raw,value=${{inputs.version}},enable=${{ github.event_name != 'push' }} + - name: Create build args + run: | + export POETRY_INSTALL_ARGS="$(make print-var VAR=POETRY_DEFAULT_EXTRAS)" + echo "POETRY_INSTALL_ARGS=$POETRY_INSTALL_ARGS" >> $GITHUB_OUTPUT + id: build-args - name: Build and push uses: docker/build-push-action@v5 with: context: . platforms: linux/amd64 push: true + build-args: | + POETRY_INSTALL_ARGS=${{steps.build-args.outputs.POETRY_INSTALL_ARGS}} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} provenance: false diff --git a/CHANGELOG.md b/CHANGELOG.md index d37bd399..092f871a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,36 @@ +## [0.8.0] - 2024-02-15 +### Breaking changes +* Added support for Pydantic v2 [#288](https://github.com/pinecone-io/canopy/pull/288) + +**Full Changelog**: https://github.com/pinecone-io/canopy/compare/v0.7.0...v0.8.0 + +## [0.7.0] - 2024-02-15 +### Breaking changes +* Move config directory to be part of the canopy package [#278](https://github.com/pinecone-io/canopy/pull/278) + +### Bug fixes +* Fix building images on release [#252](https://github.com/pinecone-io/canopy/pull/252) +* Exporting the correct module CohereRecordEncoder [#264](https://github.com/pinecone-io/canopy/pull/264) (Thanks @tomaarsen!) +* Fixed GRPC support [#270](https://github.com/pinecone-io/canopy/pull/270) +* Change the minimum version of FastAPI to 0.93.0 [#279](https://github.com/pinecone-io/canopy/pull/279) +* Reduce the docker image size [#277](https://github.com/pinecone-io/canopy/pull/277) + +### Added +* Generalize chunk creation [#258](https://github.com/pinecone-io/canopy/pull/258) +* Add SentenceTransformersRecordEncoder [#263](https://github.com/pinecone-io/canopy/pull/263) (Thanks @tomaarsen!) 
+* Add HybridRecordEncoder [#265](https://github.com/pinecone-io/canopy/pull/265) +* Make transformers optional & allow pinecone-text with dense optional [#266](https://github.com/pinecone-io/canopy/pull/266) +* Add cohere reranker [#269](https://github.com/pinecone-io/canopy/pull/269) +* Add dimension support for OpenAI embeddings [#273](https://github.com/pinecone-io/canopy/pull/273) +* Include config template files inside the package and add a CLI command to dump them [#287](https://github.com/pinecone-io/canopy/pull/287) + +### Documentation +* Add contributing guide [#254](https://github.com/pinecone-io/canopy/pull/254) +* Update README [#267](https://github.com/pinecone-io/canopy/pull/267) (Thanks @aulorbe!) +* Fixed typo in dense.py docstring [#280](https://github.com/pinecone-io/canopy/pull/280) (Thanks @ptorru!) + +**Full Changelog**: https://github.com/pinecone-io/canopy/compare/v0.6.0...v0.7.0 + ## [0.6.0] - 2024-01-16 ### Breaking changes * Pinecone serverless support [#246](https://github.com/pinecone-io/canopy/pull/246) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..8d5d680f --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,157 @@ +# Contributing to Canopy +Thank you for considering contributing to Canopy! We appreciate the time and effort you put +into making this project better. Following these guidelines will help streamline the process +and make it easier for both contributors and maintainers. + + +## Issues +If you encounter any [issues](https://github.com/pinecone-io/canopy/issues/new/choose) while using the project, please report them. +Include a detailed description of the problem, steps to reproduce it, and your environment details. + +For any question, please use the `Discussions` section rather than opening an issue. This helps keep the issue tracker +focused on bugs and feature requests. + +## Feature requests +If you have a feature request, please open an issue and describe the feature you would like to see, using the "Feature request" template. + +## Contributing code + +It is really simple to get started and create a pull request. Canopy is released regularly, so, you should see your +improvements released in a matter of days or weeks 🚀 + +If this is your first contribution to Canopy, you can start by looking at issues with the +["good first issue"](https://github.com/pinecone-io/canopy/issues?q=is:issue+is:open+label:%22good+first+issue%22) +label on GitHub. +If you find an issue that you'd like to work on, please assign the issue to yourself and leave a comment to let others know that you are working on it. Feel free to start a discussion on the issue to discuss optional designs or approaches. + +### Building from source +If you are planning to contribute to Canopy, you will need to create your own fork of the repository. +If you just want to test the code locally, you can clone the repository directly. + +1. Fork the repository on GitHub and clone your fork locally. + + ```bash + # Clone your fork and cd into the repo directory + git clone git@github.com:/canopy.git + cd canopy + ``` +2. Install poetry, which is required for dependency management. It is recommended to install poetry in a virtual environment. + You can install poetry using pip + ```bash + pip install poetry + ``` + or using the following command + + ```bash + # Install poetry + curl -sSL https://install.python-poetry.org | python3 - + ``` +3. 
Install the dependencies and dev dependencies + ```bash + # Install canopy, dependencies and dev dependencies + poetry install --with dev + ``` +4. Set up accounts and define environment variables + Please refer to the [README](./README.md#mandatory-environment-variables) for more details. +5. Remember to activate the virtual environment before running any commands + ```bash + # Activate the virtual environment + poetry shell + ``` + or alternatively, you can run the commands directly using `poetry run` + ```bash + # Run the command inside the virtual environment + poetry run + ``` +#### Optional - installing extra dependencies +Canopy has a few optional dependencies, mostly for additional service providers. If you want to use Canopy with any of these providers, please make sure to install the relevant extra. For example, to use Canopy with Cohere, you should install with: + ```bash + # Install canopy, with the cohere extra + poetry install --with dev --extras cohere + ``` + +### Running tests +Canopy uses unit tests, system tests and end-to-end tests. Unit tests verify the functionality of each code module, without any external dependencies. System tests verify integration with services like Pinecone DB and OpenAI API. End-to-End tests verify the functionality of the entire Canopy server. +System and end-to-end tests require valid API keys for Pinecone and Open AI. Some optional providers require additional environment variables, and are otherwise skipped. +You can create a single `.env` file in the root directory of the repository and set all the environment variables there. + +To run all tests, run the following command: +```bash +# Run all tests +poetry run pytest tests/ +``` +You can also run only one type of tests using the following commands: +```bash +# Run unit tests +poetry run pytest tests/unit + +# Run system tests +poetry run pytest tests/system + +# Run end-to-end tests +poetry run pytest tests/e2e +``` + +### Check out a new branch and make your changes +Create a new branch for your changes. + +```bash +# Checkout a new branch and make your changes +git checkout -b my-new-feature-branch +# Make your changes... +``` + +### Document your changes +When contributing to Canopy, please make sure that all code is well documented. + +The following should be documented using properly formatted docstrings: + +- Modules +- Class definitions +- Function definitions +- Module-level variables + +Canopy uses [Google-style docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) formatted +according to [PEP 257](https://www.python.org/dev/peps/pep-0257/) guidelines. +(See [Example Google Style Python Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) +for further examples.) + +[pydoclint](https://github.com/jsh9/pydoclint) is used for linting docstrings. You can run `make lint` to check your docstrings. + +If you are making changes to the public API, please update the documentation in the README.md file. + +### Add the relevant tests +All code changes to Canopy need to be covered by tests. After making your changes, make sure to add relevant unit tests. +Tests that require external integration (e.g. communication with an API or service) should be placed under the `tests/system/` directory. + +Please make an effort to avoid code duplication. Some unit tests have a common base class that can be extended. Other tests use fixtures to parameterize test cases over several subclasses. 
Instead of copy-pasting other test cases, try to utilize these mechanisms as much as possible. + +### Run linting, static type checking and unit tests +Run the following to make sure everything is working as expected: + +```bash +# Run unit tests +make test-unit +# If you don't have make installed, you can run the following command instead +poetry run pytest tests/unit + +# Lint the code +make lint +# Or alternatively +poetry run flake8 . + +# Run static type checking +make static +# Or +poetry run mypy src +``` +(There are a few more sub-commands in the Makefile which you might want to use. You can run `make help` to see more options.) + +### Commit your changes, push to GitHub, and open a Pull Request + +Commit your changes, push your branch to GitHub, then use GitHub's website to create a pull request. + +Please follow the pull request template and fill in as much information as possible. Link to any relevant issues and include a description of your changes. + + + diff --git a/Dockerfile b/Dockerfile index 13813cb0..cd042651 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,6 +3,7 @@ ARG PYTHON_VERSION=3.11.7 ARG PORT=8000 +ARG POETRY_INSTALL_ARGS="" ################################ # PYTHON-BASE # Sets up all our shared environment variables @@ -63,9 +64,10 @@ WORKDIR /app COPY pyproject.toml ./ RUN poetry lock +ARG POETRY_INSTALL_ARGS # install runtime deps to VIRTUAL_ENV RUN --mount=type=cache,target=/root/.cache \ - poetry install --no-root --all-extras --only main + poetry install --no-root --only main $POETRY_INSTALL_ARGS ################################ @@ -78,13 +80,13 @@ WORKDIR /app COPY --from=builder-base /app/pyproject.toml pyproject.toml COPY --from=builder-base /app/poetry.lock poetry.lock - +ARG POETRY_INSTALL_ARGS # quicker install as runtime deps are already installed RUN --mount=type=cache,target=/root/.cache \ - poetry install --no-root --all-extras --with dev + poetry install --no-root --with dev $POETRY_INSTALL_ARGS COPY . . -RUN poetry install --all-extras --only-root +RUN poetry install --only-root $POETRY_INSTALL_ARGS ARG PORT EXPOSE $PORT @@ -101,7 +103,7 @@ FROM python-base as production ENV WORKER_COUNT=1 LABEL org.opencontainers.image.source="https://github.com/pinecone-io/canopy" -LABEL org.opencontainers.image.description="Image containing the canopy server." +LABEL org.opencontainers.image.description="Retrieval Augmented Generation (RAG) framework and context engine powered by Pinecone" LABEL org.opencontainers.image.licenses="Apache-2.0" RUN DEBIAN_FRONTEND=noninteractive apt-get update && \ @@ -119,9 +121,9 @@ COPY --from=builder-base /app/pyproject.toml pyproject.toml COPY --from=builder-base /app/poetry.lock poetry.lock COPY src/ src/ -COPY config/ config/ RUN touch README.md -RUN poetry install --all-extras --only-root +ARG POETRY_INSTALL_ARGS +RUN poetry install --only-root $POETRY_INSTALL_ARGS ARG PORT EXPOSE $PORT diff --git a/Makefile b/Makefile index acfdec22..897d9f10 100644 --- a/Makefile +++ b/Makefile @@ -1,20 +1,24 @@ TEST_WORKER_COUNT = 8 -REPOSITORY = ghcr.io/pinecone-io/canopy +POETRY_DEFAULT_EXTRAS = -E cohere -E transformers -E grpc +POETRY_INSTALL_ARGS = +REPOSITORY = ghcr.io/pinecone-io/canopy IMAGE_TAG = $(shell poetry version -s) + CONTAINER_PORT = 8000 CONTAINER_ENV_FILE = .env CONTAINER_BUILD_DIR = .
CONTAINER_BUILD_PLATFORM = linux/amd64 -CONTAINER_COMMON_BUILD_ARGS = --progress plain --platform $(CONTAINER_BUILD_PLATFORM) --build-arg PORT=$(CONTAINER_PORT) -CONTAINER_EXTRA_BUILD_ARGS = +CONTAINER_SYSTEM_BUILD_ARGS = --progress plain --platform $(CONTAINER_BUILD_PLATFORM) --build-arg PORT=$(CONTAINER_PORT) --build-arg POETRY_INSTALL_ARGS="$(POETRY_DEFAULT_EXTRAS) $(POETRY_INSTALL_ARGS)" +CONTAINER_BUILD_ARGS = # Only add the env file if it exists -CONTAINER_COMMON_RUN_ARGS = --platform linux/amd64 -p $(CONTAINER_PORT):$(CONTAINER_PORT) $(shell [ -e "$(CONTAINER_ENV_FILE)" ] && echo "--env-file $(CONTAINER_ENV_FILE)") -CONTAINER_EXTRA_RUN_ARGS = +CONTAINER_SYSTEM_RUN_ARGS = --platform linux/amd64 -p $(CONTAINER_PORT):$(CONTAINER_PORT) $(shell [ -e "$(CONTAINER_ENV_FILE)" ] && echo "--env-file $(CONTAINER_ENV_FILE)") +CONTAINER_RUN_ARGS = -.PHONY: lint static test test-unit test-system test-e2e docker-build docker-build-dev docker-run docker-run-dev help + +.PHONY: lint static install install-extras install-all-extras test test-unit test-system test-e2e docker-build docker-build-dev docker-run docker-run-dev print-var help lint: poetry run flake8 . @@ -22,6 +26,15 @@ lint: static: poetry run mypy src +install: + poetry install $(POETRY_INSTALL_ARGS) + +install-extras: + poetry install $(POETRY_DEFAULT_EXTRAS) $(POETRY_INSTALL_ARGS) + +install-all-extras: + poetry install --all-extras $(POETRY_INSTALL_ARGS) + test: poetry run pytest -n $(TEST_WORKER_COUNT) --dist loadscope @@ -36,33 +49,43 @@ test-e2e: docker-build: @echo "Building Docker image..." - docker build $(CONTAINER_COMMON_BUILD_ARGS) $(CONTAINER_EXTRA_BUILD_ARGS) -t $(REPOSITORY):$(IMAGE_TAG) $(CONTAINER_BUILD_DIR) + docker build $(CONTAINER_SYSTEM_BUILD_ARGS) $(CONTAINER_BUILD_ARGS) -t $(REPOSITORY):$(IMAGE_TAG) $(CONTAINER_BUILD_DIR) @echo "Docker build complete." docker-build-dev: @echo "Building Docker image for development..." - docker build $(CONTAINER_COMMON_BUILD_ARGS) $(CONTAINER_EXTRA_BUILD_ARGS) -t $(REPOSITORY)-dev:$(IMAGE_TAG) --target=development $(CONTAINER_BUILD_DIR) + docker build $(CONTAINER_SYSTEM_BUILD_ARGS) $(CONTAINER_BUILD_ARGS) -t $(REPOSITORY)-dev:$(IMAGE_TAG) --target=development $(CONTAINER_BUILD_DIR) @echo "Development Docker build complete." docker-run: - docker run $(CONTAINER_COMMON_RUN_ARGS) $(CONTAINER_EXTRA_RUN_ARGS) $(REPOSITORY):$(IMAGE_TAG) + docker run $(CONTAINER_SYSTEM_RUN_ARGS) $(CONTAINER_RUN_ARGS) $(REPOSITORY):$(IMAGE_TAG) docker-run-dev: - docker run $(CONTAINER_COMMON_RUN_ARGS) $(CONTAINER_EXTRA_RUN_ARGS) -it $(REPOSITORY)-dev:$(IMAGE_TAG) + docker run $(CONTAINER_SYSTEM_RUN_ARGS) $(CONTAINER_RUN_ARGS) -it $(REPOSITORY)-dev:$(IMAGE_TAG) + +print-var: + @echo "$($(VAR))" help: @echo "Available targets:" @echo "" @echo " -- DEV -- " - @echo " make lint - Lint the code." - @echo " make static - Run static type checks." - @echo " make test - Test the code." - @echo " make test-unit - Run unit tests." - @echo " make test-system - Run system tests." - @echo " make test-e2e - Run e2e tests." + @echo " make install - Install only the required dependencies without any extras." + @echo " make install-extras - Install the dependencies with the default extras." + @echo " make install-all-extras - Install the dependencies with all extras." + @echo " make lint - Lint the code." + @echo " make static - Run static type checks." + @echo " make test - Test the code." + @echo " make test-unit - Run unit tests." + @echo " make test-system - Run system tests." + @echo " make test-e2e - Run e2e tests." 
@echo "" @echo " -- DOCKER -- " - @echo " make docker-build - Build the Docker image." - @echo " make docker-build-dev - Build the Docker image for development." - @echo " make docker-run - Run the Docker image." - @echo " make docker-run-dev - Run the Docker image for development." + @echo " make docker-build - Build the Docker image." + @echo " make docker-build-dev - Build the Docker image for development." + @echo " make docker-run - Run the Docker image." + @echo " make docker-run-dev - Run the Docker image for development." + @echo "" + @echo " -- MISC -- " + @echo " make print-var VAR= - Print the value of a variable." + diff --git a/README.md b/README.md index f59d8e47..89cf012a 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,23 @@ source canopy-env/bin/activate pip install canopy-sdk ``` +
+You can also install canopy-sdk with extras. See the table of available extras below. +
+
+
+### Extras
+
+| Name           | Description                                                                                                                                               |
+|----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `grpc`         | To unlock some performance improvements by working with the gRPC version of the [Pinecone Client](https://github.com/pinecone-io/pinecone-python-client)   |
+| `torch`        | To enable embeddings provided by [sentence-transformers](https://www.sbert.net/)                                                                            |
+| `transformers` | If you are using Anyscale LLMs, it's recommended to use the `LLamaTokenizer` tokenizer, which requires transformers as a dependency                         |
+| `cohere`       | To use the Cohere reranker and/or the Cohere LLM                                                                                                            |
+
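+For example, to install `canopy-sdk` together with the `grpc` and `cohere` extras (any combination of the extras listed above can be used), run:
+
+```bash
+pip install "canopy-sdk[grpc,cohere]"
+```
+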
+ + 2. Set up the environment variables ```bash @@ -195,11 +212,6 @@ INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ### Stopping the server To stop the server, simply press `CTRL+C` in the terminal where you started it. -If you have started the server in the background, you can stop it by running: - -```bash -canopy stop -``` ## Evaluation chat tool @@ -225,9 +237,12 @@ This will open a similar chat interface window, but will show both the RAG and n ## Considerations -* Canopy currently only supports OpenAI as the backend for both the embedding model and the LLM. Rate limits and pricing set by OpenAI will apply. +* Rate limits and pricing set by model providers apply to Canopy usage. Canopy currently works with OpenAI, Azure OpenAI, Anyscale, and Cohere models. * More integrations will be supported in the near future. +## Contributing +Thank you for considering contributing to Canopy! Please see our [contributing guidelines](./CONTRIBUTING.md) for more information. + ## Advanced usage ### Migrating an existing OpenAI application to **Canopy** @@ -250,19 +265,18 @@ client = OpenAI(base_url="http://localhost:8000/v1/my-namespace") ### Running Canopy server in production -Canopy is using FastAPI as the web framework and Uvicorn as the ASGI server. It is recommended to use Gunicorn as the production server, mainly because it supports multiple worker processes and can handle multiple requests in parallel, more details can be found [here](https://www.uvicorn.org/deployment/#using-a-process-manager). - -To run the canopy server for production, please run: +Canopy uses FastAPI as the web framework and Uvicorn as the ASGI server. +To use Canopy in production, it is recommended to utilize Canopy's Docker image, available on [GitHub Packages](https://github.com/pinecone-io/canopy/pkgs/container/canopy), +for your production needs. +For guidance on deploying Canopy on the Google Cloud Platform (GCP), refer to the example provided in the +[Deployment to GCP](docs/deployment-gcp.md) documentation. +Alternatively, you can use Gunicorn as a production-grade process manager; more details [here](https://www.uvicorn.org/deployment/#using-a-process-manager). +Set your desired `PORT` and `WORKER_COUNT` environment variables, and start the server with: ```bash -gunicorn canopy_server.app:app --worker-class uvicorn.workers.UvicornWorker --bind 0.0.0.0:PORT --workers WORKER_COUNT +gunicorn canopy_server.app:app --worker-class uvicorn.workers.UvicornWorker --bind 0.0.0.0:$PORT --workers $WORKER_COUNT ``` -Alternatively, consider utilizing the Canopy Docker image available on [GitHub Packages](https://github.com/pinecone-io/canopy/pkgs/container/canopy) -for your production needs. For guidance on deploying Canopy on the Google Cloud Platform (GCP), refer to the example provided in the -[Deployment to GCP](docs/deployment-gcp.md) documentation. - - > [!IMPORTANT] > The server interacts with services like Pinecone and OpenAI using your own authentication credentials.
When deploying the server on a public web hosting provider, it is recommended to enable an authentication mechanism, diff --git a/examples/canopy-lib-quickstart.ipynb b/examples/canopy-lib-quickstart.ipynb index d25cd768..1005ad54 100644 --- a/examples/canopy-lib-quickstart.ipynb +++ b/examples/canopy-lib-quickstart.ipynb @@ -753,7 +753,7 @@ " def chunk_single_document(self, document: Document) -> List[KBDocChunk]:\n", " line_chunks = [chunk\n", " for chunk in document.text.split(\"\\n\")]\n", - " return [KBDocChunk(id=f\"{document.id}_{i}\",\n", + " return [KBDocChunk(id=self.generate_chunk_id(document.id, i),\n", " document_id=document.id,\n", " text=text_chunk,\n", " source=document.source,\n", diff --git a/pyproject.toml b/pyproject.toml index 0057789b..56fe49ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "canopy-sdk" -version = "0.6.0" +version = "0.8.0" description = "Retrieval Augmented Generation (RAG) framework and context engine powered by Pinecone" authors = ["Relevance Team "] readme = "README.md" @@ -11,13 +11,12 @@ packages = [{include = "canopy", from = "src"}, [tool.poetry.dependencies] python = ">=3.9,<3.13" -pinecone-client = "^3.0.0" python-dotenv = "^1.0.0" openai = "^1.2.3" tiktoken = "^0.3.3" -pydantic = "^1.10.7" +pydantic = "^2.0.0" pandas-stubs = "^2.0.3.230814" -fastapi = ">=0.92.0, <1.0.0" +fastapi = ">=0.93.0, <1.0.0" uvicorn = ">=0.20.0, <1.0.0" tenacity = "^8.2.1" sse-starlette = "^1.6.5" @@ -28,22 +27,45 @@ types-pyyaml = "^6.0.12.12" jsonschema = "^4.2.0" types-jsonschema = "^4.2.0" prompt-toolkit = "^3.0.39" -pinecone-text = "^0.7.2" tokenizers = "^0.15.0" -transformers = "^4.35.2" +transformers = {version = "^4.35.2", optional = true} sentencepiece = "^0.1.99" pandas = "2.0.0" pyarrow = "^14.0.1" cohere = { version = ">=4.37", optional = true } qdrant-client = "^1.7.2" + +pinecone-text = "^0.8.0" +# Extra: torch (Relies on pinecone-text[dense]) +# Dependencies here should be synced with pinecone-text's pyproject.toml +# See: https://github.com/pinecone-io/pinecone-text/blob/0eb00a202f5c9bc8cc48c8b7536fcbabf95f096e/pyproject.toml#L30 +torch = { version = ">=1.13.1", optional = true } +sentence-transformers = { version = ">=2.0.0", optional = true } + + +pinecone-client = "^3.0.0" +# Extra: grpc (Relies on pinecone-client[grpc]) +# Dependencies here should be synced with pinecone-python-client's pyproject.toml +# See: https://github.com/pinecone-io/pinecone-python-client/blob/886f932b66521a6ab5b1e076f6a53ba2f16eb41b/pyproject.toml#L94 +grpcio = { version = ">=1.44.0", optional = true } +grpc-gateway-protoc-gen-openapiv2 = { version = "0.1.0", optional = true } +googleapis-common-protos = { version = ">=1.53.0", optional = true } +lz4 = { version = ">=3.1.3", optional = true } +protobuf = { version = "~=3.20.0", optional = true } + + + [tool.poetry.extras] cohere = ["cohere"] +torch = ["torch", "sentence-transformers"] +transformers = ["transformers"] +grpc = ["grpcio", "grpc-gateway-protoc-gen-openapiv2", "googleapis-common-protos", "lz4", "protobuf"] [tool.poetry.group.dev.dependencies] -jupyter = "^1.0.0" pytest = "^7.3.2" +jupyter = "^1.0.0" mypy = "^1.4.1" flake8 = "^6.1.0" pytest-html = "^4.1.0" @@ -71,7 +93,11 @@ module = [ 'pinecone_text.*', 'pinecone_datasets', 'pinecone', - 'transformers.*' + 'transformers.*', + 'tokenizers.*', + 'cohere.*', + 'pinecone.grpc', + 'huggingface_hub.utils' ] ignore_missing_imports = true diff --git a/src/canopy/chat_engine/chat_engine.py b/src/canopy/chat_engine/chat_engine.py 
index c6090021..645fca17 100644 --- a/src/canopy/chat_engine/chat_engine.py +++ b/src/canopy/chat_engine/chat_engine.py @@ -1,4 +1,3 @@ -import os from abc import ABC, abstractmethod from typing import Iterable, Union, Optional, cast @@ -13,9 +12,7 @@ StreamingChatResponse, ) from canopy.models.data_models import Context, Messages, SystemMessage from canopy.utils.config import ConfigurableMixin - -CE_DEBUG_INFO = os.getenv("CE_DEBUG_INFO", "FALSE").lower() == "true" - +from canopy.utils.debugging import CANOPY_DEBUG_INFO DEFAULT_SYSTEM_PROMPT = """Use the following pieces of context to answer the user question at the next messages. This context retrieved from a knowledge database and you should use only the facts from the context to answer. Always remember to include the source to the documents you used from their 'source' field in the format 'Source: $SOURCE_HERE'. If you don't know the answer, just say that you don't know, don't try to make up an answer, use the context. @@ -223,8 +220,8 @@ def chat(self, stream=stream, model_params=model_params_dict) debug_info = {} - if CE_DEBUG_INFO: - debug_info['context'] = context.dict() + if CANOPY_DEBUG_INFO: + debug_info['context'] = context.model_dump() debug_info['context'].update(context.debug_info) if stream: diff --git a/src/canopy/chat_engine/query_generator/__init__.py b/src/canopy/chat_engine/query_generator/__init__.py index e91a19e0..332094cc 100644 --- a/src/canopy/chat_engine/query_generator/__init__.py +++ b/src/canopy/chat_engine/query_generator/__init__.py @@ -2,3 +2,4 @@ from .function_calling import FunctionCallingQueryGenerator from .last_message import LastMessageQueryGenerator from .instruction import InstructionQueryGenerator +from .cohere import CohereQueryGenerator diff --git a/src/canopy/chat_engine/query_generator/cohere.py b/src/canopy/chat_engine/query_generator/cohere.py new file mode 100644 index 00000000..450872ac --- /dev/null +++ b/src/canopy/chat_engine/query_generator/cohere.py @@ -0,0 +1,42 @@ +from typing import List, Optional, cast + +from canopy.chat_engine.query_generator import QueryGenerator +from canopy.chat_engine.history_pruner.raising import RaisingHistoryPruner +from canopy.llm import BaseLLM, CohereLLM +from canopy.models.data_models import Messages, Query + + +class CohereQueryGenerator(QueryGenerator): + """ + Query generator for LLM clients that have a built-in feature to + generate search queries from chat messages. 
+ """ + _DEFAULT_COMPONENTS = { + "llm": CohereLLM, + } + + def __init__(self, + *, + llm: Optional[BaseLLM] = None): + self._llm = llm or self._DEFAULT_COMPONENTS["llm"]() + + if not isinstance(self._llm, CohereLLM): + raise NotImplementedError( + "CohereQueryGenerator only compatible with CohereLLM" + ) + + self._history_pruner = RaisingHistoryPruner() + + def generate(self, + messages: Messages, + max_prompt_tokens: int) -> List[Query]: + messages = self._history_pruner.build(chat_history=messages, + max_tokens=max_prompt_tokens) + llm = cast(CohereLLM, self._llm) + queries = llm.generate_search_queries(messages) + return [Query(text=query) for query in queries] + + async def agenerate(self, + messages: Messages, + max_prompt_tokens: int) -> List[Query]: + raise NotImplementedError diff --git a/config/anyscale.yaml b/src/canopy/config_templates/anyscale.yaml similarity index 100% rename from config/anyscale.yaml rename to src/canopy/config_templates/anyscale.yaml diff --git a/config/azure.yaml b/src/canopy/config_templates/azure.yaml similarity index 100% rename from config/azure.yaml rename to src/canopy/config_templates/azure.yaml diff --git a/src/canopy/config_templates/cohere.yaml b/src/canopy/config_templates/cohere.yaml new file mode 100644 index 00000000..0d0bfaea --- /dev/null +++ b/src/canopy/config_templates/cohere.yaml @@ -0,0 +1,83 @@ +# ================================================================== +# Configuration file for Canopy Server with Cohere. +# ================================================================== + +# --------------------------------------------------------------------------------- +system_prompt: &system_prompt | + Use the documents to answer the user question at the next messages. The documents are retrieved from a knowledge + database and you should use only the facts from the documents to answer. Always remember to include the source to + the documents you used from their 'source' field in the format 'Source: $SOURCE_HERE'. + If you don't know the answer, just say that you don't know, don't try to make up an answer, use the documents. + Don't address the documents directly, but use them to answer the user question like it's your own knowledge. + + +# ------------------------------------------------------------------------------------------- +# Tokenizer configuration +# ------------------------------------------------------------------------------------------- +tokenizer: + type: CohereHFTokenizer + params: + model_name: Cohere/Command-nightly + + +# ------------------------------------------------------------------------------------------------------------- +# Chat engine configuration +# ------------------------------------------------------------------------------------------------------------- +chat_engine: + params: + system_prompt: *system_prompt + + # ------------------------------------------------------------------------------------------------------------- + # LLM configuration + # ------------------------------------------------------------------------------------------------------------- + llm: &llm + type: CohereLLM + params: + model_name: command + # You can add any additional parameters which are supported by the Cohere Co.Chat API. The values set + # here will be used in every Co.Chat API call. 
For example: + # prompt_truncation: "AUTO" + # citation_quality: "accurate" + # temperature: 0.85 + # Specifying connectors is contrary to Canopy's purpose of searching the Pinecone knowledge base only, + # but technically can still be passed like this: + # connectors: + # - "web-search" + # Uncomment to suppress errors when unrecognized or unsupported model params are sent to CohereLLM. + # ignore_unrecognized_params: true + + # -------------------------------------------------------------------- + # Configuration for the QueryBuilder subcomponent of the chat engine. + # -------------------------------------------------------------------- + query_builder: + type: CohereQueryGenerator + params: {} + llm: + <<: *llm + + + # ------------------------------------------------------------------------------------------------------------- + # ContextEngine configuration + # ------------------------------------------------------------------------------------------------------------- + context_engine: + # ----------------------------------------------------------------------------------------------------------- + # KnowledgeBase configuration + # ----------------------------------------------------------------------------------------------------------- + knowledge_base: + params: + default_top_k: 100 + + # -------------------------------------------------------------------------- + # Configuration for the RecordEncoder subcomponent of the knowledge base. + # -------------------------------------------------------------------------- + record_encoder: + type: CohereRecordEncoder + params: + model_name: # The name of the model to use for encoding + "embed-english-v3.0" + batch_size: 100 # The number of document chunks to encode in each call to the encoding model + + reranker: + type: CohereReranker + params: + top_n: 5 \ No newline at end of file diff --git a/config/config.yaml b/src/canopy/config_templates/default.yaml similarity index 99% rename from config/config.yaml rename to src/canopy/config_templates/default.yaml index 83055e0e..b3ea239d 100644 --- a/config/config.yaml +++ b/src/canopy/config_templates/default.yaml @@ -112,9 +112,8 @@ chat_engine: type: OpenAIRecordEncoder # Options: [OpenAIRecordEncoder, AzureOpenAIRecordEncoder] params: model_name: # The name of the model to use for encoding - text-embedding-ada-002 + text-embedding-3-small batch_size: 400 # The number of document chunks to encode in each call to the encoding model - create_index_params: # ------------------------------------------------------------------------------------------- # Initialization parameters to be passed to create a canopy index. 
These parameters will diff --git a/src/canopy/context_engine/context_builder/stuffing.py b/src/canopy/context_engine/context_builder/stuffing.py index e1cd7c8d..3b2350fc 100644 --- a/src/canopy/context_engine/context_builder/stuffing.py +++ b/src/canopy/context_engine/context_builder/stuffing.py @@ -1,7 +1,9 @@ +import json from itertools import zip_longest from typing import List, Tuple from pydantic import BaseModel +from canopy.utils.debugging import CANOPY_DEBUG_INFO from canopy.context_engine.context_builder.base import ContextBuilder from canopy.knowledge_base.models import QueryResult, DocumentWithScore @@ -22,15 +24,12 @@ class ContextQueryResult(BaseModel): class StuffingContextContent(ContextContent): - __root__: List[ContextQueryResult] - - def dict(self, **kwargs): - return super().dict(**kwargs)['__root__'] + root: List[ContextQueryResult] # In the case of StuffingContextBuilder, we simply want the text representation to # be a json. Other ContextContent subclasses may render into text differently def to_text(self, **kwargs): - return self.json(**kwargs) + return json.dumps(self.model_dump(), **kwargs) # ------------- CONTEXT BUILDER ------------- @@ -50,14 +49,15 @@ def build(self, context_query_results = [ ContextQueryResult(query=qr.query, snippets=[]) for qr in query_results] - debug_info = {"num_docs": len(sorted_docs_with_origin)} - content = StuffingContextContent(__root__=context_query_results) + debug_info = {"num_docs": len(sorted_docs_with_origin), "snippet_ids": []} + content = StuffingContextContent(context_query_results) if self._tokenizer.token_count(content.to_text()) > max_context_tokens: - return Context(content=StuffingContextContent(__root__=[]), + return Context(content=StuffingContextContent([]), num_tokens=1, debug_info=debug_info) seen_doc_ids = set() + snippet_ids = [] for doc, origin_query_idx in sorted_docs_with_origin: if doc.id not in seen_doc_ids and doc.text.strip() != "": snippet = ContextSnippet(text=doc.text, source=doc.source) @@ -69,15 +69,19 @@ def build(self, # if the context is too long, remove the snippet if self._tokenizer.token_count(content.to_text()) > max_context_tokens: context_query_results[origin_query_idx].snippets.pop() + else: + snippet_ids.append(doc.id) + + debug_info["snippet_ids"] = snippet_ids # remove queries with no snippets content = StuffingContextContent( - __root__=[qr for qr in context_query_results if len(qr.snippets) > 0] + [qr for qr in context_query_results if len(qr.snippets) > 0] ) return Context(content=content, num_tokens=self._tokenizer.token_count(content.to_text()), - debug_info=debug_info) + debug_info=debug_info if CANOPY_DEBUG_INFO else {}) @staticmethod def _round_robin_sort( diff --git a/src/canopy/context_engine/context_engine.py b/src/canopy/context_engine/context_engine.py index 5603c096..31cd397a 100644 --- a/src/canopy/context_engine/context_engine.py +++ b/src/canopy/context_engine/context_engine.py @@ -1,4 +1,3 @@ -import os from abc import ABC, abstractmethod from typing import List, Optional @@ -8,8 +7,7 @@ from canopy.knowledge_base.base import BaseKnowledgeBase from canopy.models.data_models import Context, Query from canopy.utils.config import ConfigurableMixin - -CE_DEBUG_INFO = os.getenv("CE_DEBUG_INFO", "FALSE").lower() == "true" +from canopy.utils.debugging import CANOPY_DEBUG_INFO class BaseContextEngine(ABC, ConfigurableMixin): @@ -110,8 +108,10 @@ def query(self, queries: List[Query], namespace=namespace) context = self.context_builder.build(query_results, max_context_tokens) - if 
CE_DEBUG_INFO: - context.debug_info["query_results"] = [qr.dict() for qr in query_results] + if CANOPY_DEBUG_INFO: + context.debug_info["query_results"] = [ + {**qr.model_dump(), **qr.debug_info} for qr in query_results + ] return context async def aquery(self, queries: List[Query], max_context_tokens: int, diff --git a/src/canopy/knowledge_base/chunker/base.py b/src/canopy/knowledge_base/chunker/base.py index 3b34f8b5..2111676b 100644 --- a/src/canopy/knowledge_base/chunker/base.py +++ b/src/canopy/knowledge_base/chunker/base.py @@ -7,7 +7,6 @@ class Chunker(ABC, ConfigurableMixin): - """ Base class for chunkers. Chunkers take a document (id, text, ...) and return a list of KBDocChunks (id, text, document_id, ...) @@ -57,3 +56,6 @@ def chunk_single_document(self, document: Document) -> List[KBDocChunk]: @abstractmethod async def achunk_single_document(self, document: Document) -> List[KBDocChunk]: raise NotImplementedError() + + def generate_chunk_id(self, document_id: str, chunk_index: int) -> str: + return f"{document_id}_{chunk_index}" diff --git a/src/canopy/knowledge_base/chunker/recursive_character.py b/src/canopy/knowledge_base/chunker/recursive_character.py index fa1f16af..24019091 100644 --- a/src/canopy/knowledge_base/chunker/recursive_character.py +++ b/src/canopy/knowledge_base/chunker/recursive_character.py @@ -52,7 +52,7 @@ def chunk_single_document(self, document: Document) -> List[KBDocChunk]: """ # noqa: E501 # TODO: check overlap not bigger than max_chunk_size text_chunks = self._chunker.split_text(document.text) - return [KBDocChunk(id=f"{document.id}_{i}", + return [KBDocChunk(id=self.generate_chunk_id(document.id, i), document_id=document.id, text=text_chunk, source=document.source, diff --git a/src/canopy/knowledge_base/chunker/token_chunker.py b/src/canopy/knowledge_base/chunker/token_chunker.py index 465d9035..9c918ad5 100644 --- a/src/canopy/knowledge_base/chunker/token_chunker.py +++ b/src/canopy/knowledge_base/chunker/token_chunker.py @@ -69,7 +69,7 @@ def chunk_single_document(self, document: Document) -> List[KBDocChunk]: text_chunks = [self._tokenizer.detokenize(chunk) for chunk in token_chunks] - return [KBDocChunk(id=f"{document.id}_{i}", + return [KBDocChunk(id=self.generate_chunk_id(document.id, i), document_id=document.id, text=text_chunk, source=document.source, diff --git a/src/canopy/knowledge_base/knowledge_base.py b/src/canopy/knowledge_base/knowledge_base.py index 2da6239c..f756eb95 100644 --- a/src/canopy/knowledge_base/knowledge_base.py +++ b/src/canopy/knowledge_base/knowledge_base.py @@ -5,17 +5,21 @@ from typing import List, Optional, Dict, Any, Union from pinecone import (ServerlessSpec, PodSpec, - Pinecone, PineconeApiException) + PineconeApiException) + +from canopy.utils.debugging import CANOPY_DEBUG_INFO try: - from pinecone import GRPCIndex as Index + from pinecone.grpc import PineconeGRPC as Pinecone + from pinecone.grpc import GRPCIndex as Index except ImportError: - from pinecone import Index + from pinecone import Pinecone, Index from canopy.knowledge_base.base import BaseKnowledgeBase from canopy.knowledge_base.chunker import Chunker, MarkdownChunker from canopy.knowledge_base.record_encoder import (RecordEncoder, - OpenAIRecordEncoder) + OpenAIRecordEncoder, + HybridRecordEncoder) from canopy.knowledge_base.models import (KBQueryResult, KBQuery, QueryResult, KBDocChunkWithScore, DocumentWithScore) from canopy.knowledge_base.reranker import Reranker, TransparentReranker @@ -257,7 +261,6 @@ def verify_index_connection(self) -> None: 
def create_canopy_index(self, spec: Union[Dict, ServerlessSpec, PodSpec] = None, - dimension: Optional[int] = None, metric: Optional[str] = "cosine" ): """ @@ -280,9 +283,6 @@ def create_canopy_index(self, spec: A dictionary containing configurations describing how the index should be deployed. For serverless indexes, specify region and cloud. For pod indexes, specify replicas, shards, pods, pod_type, metadata_config, and source_collection. - dimension: The dimension of the vectors to index. - If `dimension` isn't explicitly provided, - Canopy would try to infer the embedding's dimension based on the configured `Encoder` metric: The distance metric to be used for similarity search: 'euclidean', 'cosine', or 'dotproduct'. The default is 'cosine'. @@ -294,22 +294,21 @@ def create_canopy_index(self, region="us-west-2" ) - if dimension is None: - try: - encoder_dimension = self._encoder.dimension - if encoder_dimension is None: - raise RuntimeError( - f"The selected encoder {self._encoder.__class__.__name__} does " - f"not support inferring the vectors' dimensionality." - ) - dimension = encoder_dimension - except Exception as e: + try: + encoder_dimension = self._encoder.dimension + if encoder_dimension is None: raise RuntimeError( - f"Canopy has failed to infer vectors' dimensionality using the " - f"selected encoder: {self._encoder.__class__.__name__}. You can " - f"provide the dimension manually, try using a different encoder, or" - f" fix the underlying error:\n{e}" - ) from e + f"The selected encoder {self._encoder.__class__.__name__} does " + f"not support inferring the vectors' dimensionality." + ) + dimension = encoder_dimension + except Exception as e: + raise RuntimeError( + f"Canopy has failed to infer vectors' dimensionality using the " + f"selected encoder: {self._encoder.__class__.__name__}. You can " + f"provide the dimension manually, try using a different encoder, or" + f" fix the underlying error:\n{e}" + ) from e if self.index_name in list_canopy_indexes(self._pinecone_client): raise RuntimeError( @@ -318,6 +317,8 @@ def create_canopy_index(self, "If you wish to delete it call `knowledge_base.delete_index()`. " ) + self._validate_metric(metric) + try: self._pinecone_client.create_index( name=self.index_name, @@ -352,6 +353,14 @@ def _wait_for_index_provision(self): ) time.sleep(INDEX_PROVISION_TIME_INTERVAL) + def _validate_metric(self, metric: Optional[str]): + if isinstance(self._encoder, HybridRecordEncoder): + if metric != "dotproduct": + raise RuntimeError( + "HybridRecordEncoder only supports dotproduct metric. " + "Please set metric='dotproduct' on index creation." 
+ ) + @staticmethod def _get_full_index_name(index_name: str) -> str: if index_name.startswith(INDEX_NAME_PREFIX): @@ -426,20 +435,34 @@ def query(self, results = [self._query_index(q, global_metadata_filter, namespace) for q in queries] - results = self._reranker.rerank(results) + ranked_results = self._reranker.rerank(results) + assert len(results) == len(ranked_results), ("Reranker returned a different" + " number of results " + "than the number of queries") return [ QueryResult( - query=r.query, + query=rr.query, documents=[ DocumentWithScore( - **d.dict(exclude={ - 'values', 'sparse_values', 'document_id' + **d.model_dump(exclude={ + 'document_id' }) ) - for d in r.documents - ] - ) for r in results + for d in rr.documents + ], + debug_info={"db_result": QueryResult( + query=r.query, + documents=[ + DocumentWithScore( + **d.model_dump(exclude={ + 'document_id' + }) + ) + for d in r.documents + ] + ).model_dump()} if CANOPY_DEBUG_INFO else {} + ) for rr, r in zip(ranked_results, results) ] def _query_index(self, diff --git a/src/canopy/knowledge_base/models.py b/src/canopy/knowledge_base/models.py index 442b371a..cbe827ed 100644 --- a/src/canopy/knowledge_base/models.py +++ b/src/canopy/knowledge_base/models.py @@ -1,10 +1,9 @@ from copy import deepcopy from typing import List, Optional -from pinecone_text.sparse import SparseVector from pydantic import BaseModel, Field -from canopy.models.data_models import Document, Query +from canopy.models.data_models import Document, Query, SparseVector # TODO: (1) consider moving this to pinecone-text # TODO: (2) consider renaming to "Vector" or "DenseVector" @@ -37,7 +36,7 @@ def to_db_record(self): } - if self.sparse_values is not None: + if self.sparse_values is not None and len(self.sparse_values["values"]) > 0: record["sparse_values"] = self.sparse_values return record diff --git a/src/canopy/knowledge_base/record_encoder/__init__.py b/src/canopy/knowledge_base/record_encoder/__init__.py index e90ba496..260953fb 100644 --- a/src/canopy/knowledge_base/record_encoder/__init__.py +++ b/src/canopy/knowledge_base/record_encoder/__init__.py @@ -1,7 +1,9 @@ from .base import RecordEncoder -from .cohere import CohereEncoder +from .cohere import CohereRecordEncoder from .dense import DenseRecordEncoder from .openai import OpenAIRecordEncoder from .anyscale import AnyscaleRecordEncoder from .azure_openai import AzureOpenAIRecordEncoder from .jina import JinaRecordEncoder +from .sentence_transformers import SentenceTransformerRecordEncoder +from .hybrid import HybridRecordEncoder diff --git a/src/canopy/knowledge_base/record_encoder/base.py b/src/canopy/knowledge_base/record_encoder/base.py index 0f2a1b72..9aadf4f2 100644 --- a/src/canopy/knowledge_base/record_encoder/base.py +++ b/src/canopy/knowledge_base/record_encoder/base.py @@ -21,7 +21,7 @@ class RecordEncoder(ABC, ConfigurableMixin): - _encode_queries_batch Async encoders are still not supported, but will be added in the future. - """ # noqa: E501 + """ # noqa: E501 def __init__(self, batch_size: int = 1): """ @@ -30,7 +30,7 @@ def __init__(self, batch_size: int = 1): Args: batch_size: The number of documents or queries to encode at once. Defaults to 1. - """ # noqa: E501 + """ # noqa: E501 self.batch_size = batch_size # TODO: rename documents to doc_chunks or chunks @@ -47,7 +47,7 @@ def _encode_documents_batch(self, Returns: encoded chunks: A list of KBEncodedDocChunk. 
- """ # noqa: E501 + """ # noqa: E501 pass @abstractmethod @@ -61,7 +61,7 @@ def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: Returns: encoded queries: A list of KBQuery. - """ # noqa: E501 + """ # noqa: E501 pass @abstractmethod @@ -78,14 +78,6 @@ async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: def _batch_iterator(data: list, batch_size): return (data[pos:pos + batch_size] for pos in range(0, len(data), batch_size)) - @property - def dimension(self) -> Optional[int]: - """ - Returns: - The dimension of the dense vectors produced by the encoder, if applicable. - """ # noqa: E501 - return None - def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]: """ @@ -97,7 +89,7 @@ def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChun Returns: encoded chunks: A list of KBEncodedDocChunk. - """ # noqa: E501 + """ # noqa: E501 encoded_docs = [] for batch in self._batch_iterator(documents, self.batch_size): try: @@ -120,7 +112,7 @@ def encode_queries(self, queries: List[Query]) -> List[KBQuery]: Returns: encoded queries: A list of KBQuery. - """ # noqa: E501 + """ # noqa: E501 kb_queries = [] for batch in self._batch_iterator(queries, self.batch_size): @@ -152,3 +144,11 @@ async def aencode_queries(self, queries: List[Query]) -> List[KBQuery]: def _format_error(self, err): return f"{err}" + + @property + def dimension(self) -> Optional[int]: + """ + Returns: + The dimension of the dense vectors produced by the encoder, if applicable. + """ # noqa: E501 + return None diff --git a/src/canopy/knowledge_base/record_encoder/dense.py b/src/canopy/knowledge_base/record_encoder/dense.py index 55ab231b..8ad78e36 100644 --- a/src/canopy/knowledge_base/record_encoder/dense.py +++ b/src/canopy/knowledge_base/record_encoder/dense.py @@ -1,5 +1,5 @@ -from typing import List from functools import cached_property +from typing import List from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder from .base import RecordEncoder @@ -10,7 +10,7 @@ class DenseRecordEncoder(RecordEncoder): """ DenseRecordEncoder is a subclass of RecordEncoder that generates dense vector representation of documents chunks and textual queries. - The dense represntation generated by the `DenseRecordEncoder` is a list of floats in a given dimension. + The dense representation generated by the `DenseRecordEncoder` is a list of floats in a given dimension. DenseRecordEncoder wraps a BaseDenseEncoder from the `pinecone-text` library to encode the text itself. for more information about the BaseDenseEncoder see: https://github.com/pinecone-io/pinecone-text """ # noqa: E501 @@ -40,7 +40,7 @@ def _encode_documents_batch(self, encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector. """ # noqa: E501 dense_values = self._dense_encoder.encode_documents([d.text for d in documents]) - return [KBEncodedDocChunk(**d.dict(), values=v) for d, v in + return [KBEncodedDocChunk(**d.model_dump(), values=v) for d, v in zip(documents, dense_values)] def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: @@ -52,20 +52,19 @@ def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: encoded queries: A list of KBQuery, with the `values` field populated by the generated embeddings vector. 
""" # noqa: E501 dense_values = self._dense_encoder.encode_queries([q.text for q in queries]) - return [KBQuery(**q.dict(), values=v) for q, v in zip(queries, dense_values)] + return [ + KBQuery(**q.model_dump(), values=v) for q, v in zip(queries, dense_values) + ] @cached_property def dimension(self) -> int: """ The dimension is the length of the vector generated by the `DenseRecordEncoder` - Canopy will run a single word through the encoder to get the dimension, this will also validate that the encoder - is working properly. Returns: dimension(int): the dimension of the encoder """ # noqa: E501 - dummy_doc = KBDocChunk(text="hello", id="dummy_doc", document_id="dummy_doc") - return len(self.encode_documents([dummy_doc])[0].values) + return self._dense_encoder.dimension async def _aencode_documents_batch(self, documents: List[KBDocChunk] diff --git a/src/canopy/knowledge_base/record_encoder/hybrid.py b/src/canopy/knowledge_base/record_encoder/hybrid.py new file mode 100644 index 00000000..0fa5ff8a --- /dev/null +++ b/src/canopy/knowledge_base/record_encoder/hybrid.py @@ -0,0 +1,140 @@ +import logging +from functools import cached_property +from typing import List, Optional + +from pinecone_text.hybrid import hybrid_convex_scale +from pinecone_text.sparse import BM25Encoder + +from . import DenseRecordEncoder, OpenAIRecordEncoder +from .base import RecordEncoder +from canopy.knowledge_base.models import KBQuery, KBEncodedDocChunk, KBDocChunk +from canopy.models.data_models import Query + +logger = logging.getLogger(__name__) + + +class HybridRecordEncoder(RecordEncoder): + """ + HybridRecordEncoder is a subclass of RecordEncoder that generates sparse and dense vector representation of + documents` chunks and textual queries. + + The dense representation generated by the `HybridRecordEncoder` is a list of floats in a given dimension. + The sparse representation generated by the `HybridRecordEncoder` is a `SparseVector`. + + HybridRecordEncoder uses DenseRecordEncoder for dense encoding and BM25Encoder for sparse encoding. + + Alpha is a parameter that controls the weight of the dense vector in the hybrid representation. + If alpha is 1, the query vector will be the dense vector. The default value of alpha is 0.5. + + For more information about the encoders see: https://github.com/pinecone-io/pinecone-text + + """ # noqa: E501 + + _DEFAULT_COMPONENTS = { + "dense_record_encoder": OpenAIRecordEncoder + } + + def __init__(self, + dense_record_encoder: Optional[DenseRecordEncoder] = None, + alpha: float = 0.5, + bm_25_encoder_df_path: Optional[str] = None, + **kwargs): + """ + Initialize the encoder. + + Args: + dense_record_encoder: A DenseRecordEncoder to encode the text. + alpha: The weight of the dense vector in the hybrid representation (between 0 and 1). + bm_25_encoder_df_path: The path to the file that contains the document frequencies of the BM25Encoder.\ + You can create this file by fitting the BM25Encoder on a corpus of documents and calling `dump`\ + on the encoder. + **kwargs: Additional arguments to pass to the RecordEncoder. + """ # noqa: E501 + + if alpha == 0: + raise ValueError("Sparse only representation is not supported. 
" + "Alpha must be greater than 0.") + + if not 0 < alpha <= 1: + raise ValueError("Alpha must be between 0 (excluded) and 1 (included)") + + super().__init__(**kwargs) + + if dense_record_encoder: + if not isinstance(dense_record_encoder, DenseRecordEncoder): + raise TypeError( + f"dense_encoder must be an instance of DenseRecordEncoder, " + f"not {type(dense_record_encoder)}" + ) + self._dense_record_encoder = dense_record_encoder + else: + default_dense_encoder = self._DEFAULT_COMPONENTS["dense_record_encoder"] + self._dense_record_encoder = default_dense_encoder() + + self._bm_25_encoder_df_path = bm_25_encoder_df_path + self._alpha = alpha + + @cached_property + def _sparse_encoder(self) -> BM25Encoder: + logger.info("Loading the document frequencies for the BM25Encoder...") + if self._bm_25_encoder_df_path is None: + encoder = BM25Encoder.default() + else: + encoder = BM25Encoder().load(self._bm_25_encoder_df_path) + logger.info("Finished loading the document frequencies for the BM25Encoder.") + return encoder + + def _encode_documents_batch(self, + documents: List[KBDocChunk] + ) -> List[KBEncodedDocChunk]: + """ + Encode a batch of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk. + + Args: + documents: A list of KBDocChunk to encode. + Returns: + encoded chunks: A list of KBEncodedDocChunk, + with the `values` containing the generated dense vector and + `sparse_values` containing the generated sparse vector. + """ # noqa: E501 + + chunks = self._dense_record_encoder.encode_documents(documents) + sparse_values = self._sparse_encoder.encode_documents( + [d.text for d in documents] + ) + for chunk, sv in zip(chunks, sparse_values): + chunk.sparse_values = sv + return chunks + + def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: + """ + Encode a batch of queries, takes a list of Query and returns a list of KBQuery. + Args: + queries: A list of Query to encode. + Returns: + encoded queries: A list of KBQuery, with the `values` containing the generated dense vector with the weight + alpha and `sparse_values` containing the generated sparse vector with the weight (1 - alpha). 
+ """ # noqa: E501 + + dense_queries = self._dense_record_encoder.encode_queries(queries) + sparse_values = self._sparse_encoder.encode_queries([q.text for q in queries]) + + scaled_values = [ + hybrid_convex_scale(dq.values, sv, self._alpha) for dq, sv in + zip(dense_queries, sparse_values) + ] + + return [q.model_copy(update=dict(values=v, sparse_values=sv)) for q, (v, sv) in + zip(dense_queries, scaled_values)] + + @property + def dimension(self) -> int: + return self._dense_record_encoder.dimension + + async def _aencode_documents_batch(self, + documents: List[KBDocChunk] + ) -> List[KBEncodedDocChunk]: + raise NotImplementedError + + async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: + raise NotImplementedError diff --git a/src/canopy/knowledge_base/record_encoder/openai.py b/src/canopy/knowledge_base/record_encoder/openai.py index 03f1eb2a..0ccdd3b6 100644 --- a/src/canopy/knowledge_base/record_encoder/openai.py +++ b/src/canopy/knowledge_base/record_encoder/openai.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from openai import OpenAIError, RateLimitError, APIConnectionError, AuthenticationError from pinecone_text.dense.openai_encoder import OpenAIEncoder @@ -18,8 +18,9 @@ class OpenAIRecordEncoder(DenseRecordEncoder): def __init__( self, *, - model_name: str = "text-embedding-ada-002", + model_name: str = "text-embedding-3-small", batch_size: int = 400, + dimension: Optional[int] = None, **kwargs ): """ @@ -29,10 +30,11 @@ def __init__( model_name: The name of the OpenAI embeddings model to use for encoding. See https://platform.openai.com/docs/models/embeddings batch_size: The number of documents or queries to encode at once. Defaults to 400. + dimension: The dimension of the embeddings vector to generate. **kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`. """ # noqa: E501 try: - encoder = OpenAIEncoder(model_name, **kwargs) + encoder = OpenAIEncoder(model_name, dimension=dimension, **kwargs) except OpenAIError as e: raise RuntimeError( "Failed to connect to OpenAI, please make sure that the OPENAI_API_KEY " diff --git a/src/canopy/knowledge_base/record_encoder/sentence_transformers.py b/src/canopy/knowledge_base/record_encoder/sentence_transformers.py new file mode 100644 index 00000000..b15fcdb8 --- /dev/null +++ b/src/canopy/knowledge_base/record_encoder/sentence_transformers.py @@ -0,0 +1,57 @@ +from typing import Optional +from pinecone_text.dense import SentenceTransformerEncoder +from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder +from huggingface_hub.utils import RepositoryNotFoundError + + +class SentenceTransformerRecordEncoder(DenseRecordEncoder): + """ + SentenceTransformerRecordEncoder is a type of DenseRecordEncoder that uses a Sentence Transformer model. + The implementation uses the `SentenceTransformerEncoder` class from the `pinecone-text` library. + For more information about see: https://github.com/pinecone-io/pinecone-text + + """ # noqa: E501 + + def __init__(self, + *, + model_name: str = "sentence-transformers/all-MiniLM-L6-v2", + query_encoder_name: Optional[str] = None, + batch_size: int = 400, + device: Optional[str] = None, + **kwargs) -> None: + """ + Initialize the SentenceTransformerRecordEncoder + + Args: + model_name: The name of the embedding model to use for encoding documents. + See https://huggingface.co/models?library=sentence-transformers + for all possible Sentence Transformer models. 
+ query_encoder_name: The name of the embedding model to use for encoding queries. + See https://huggingface.co/models?library=sentence-transformers + for all possible Sentence Transformer models. + Defaults to `model_name`. + batch_size: The number of documents or queries to encode at once. + Defaults to 400. + device: The local device to use for encoding, for example "cpu", "cuda" or "mps". + Defaults to "cuda" if cuda is available, otherwise to "cpu". + **kwargs: Additional arguments to pass to the underlying `pinecone-text.SentenceTransformerEncoder`. + """ # noqa: E501 + try: + encoder = SentenceTransformerEncoder( + document_encoder_name=model_name, + query_encoder_name=query_encoder_name, + device=device, + **kwargs, + ) + except RepositoryNotFoundError as e: + raise RuntimeError( + "Your chosen Sentence Transformer model(s) could not be found. " + f"Details: {str(e)}" + ) from e + except ImportError: + raise ImportError( + f"{self.__class__.__name__} requires the `torch` and `transformers` " + f"extra dependencies. Please install them using " + f"`pip install canopy-sdk[torch,transformers]`." + ) + super().__init__(dense_encoder=encoder, batch_size=batch_size) diff --git a/src/canopy/knowledge_base/reranker/__init__.py b/src/canopy/knowledge_base/reranker/__init__.py index a57a57c3..91b83e44 100644 --- a/src/canopy/knowledge_base/reranker/__init__.py +++ b/src/canopy/knowledge_base/reranker/__init__.py @@ -1 +1,3 @@ -from .reranker import TransparentReranker, Reranker +from .reranker import Reranker +from .transparent import TransparentReranker +from .cohere import CohereReranker diff --git a/src/canopy/knowledge_base/reranker/cohere.py b/src/canopy/knowledge_base/reranker/cohere.py new file mode 100644 index 00000000..615581cb --- /dev/null +++ b/src/canopy/knowledge_base/reranker/cohere.py @@ -0,0 +1,84 @@ +import os +from typing import List, Optional + + +from canopy.knowledge_base.models import KBQueryResult +from canopy.knowledge_base.reranker import Reranker + +try: + import cohere + from cohere import CohereAPIError +except (OSError, ImportError, ModuleNotFoundError): + _cohere_installed = False +else: + _cohere_installed = True + + +class CohereReranker(Reranker): + """ + Reranker that uses Cohere's text embedding to rerank documents. + + For each query and documents returned for that query, returns a list + of documents ordered by their relevance to the provided query. + """ + + def __init__(self, + model_name: str = 'rerank-english-v2.0', + *, + top_n: int = 10, + api_key: Optional[str] = None): + """ + Initializes the Cohere reranker. + + Args: + model_name: The identifier of the model to use, one of : + rerank-english-v2.0, rerank-multilingual-v2.0 + top_n: The number of most relevant documents return, defaults to 10 + api_key: API key for Cohere. If not passed `CO_API_KEY` environment + variable will be used. + """ + + if not _cohere_installed: + raise ImportError( + "Failed to import cohere. Make sure you install cohere extra " + "dependencies by running: " + "pip install canopy-sdk[cohere]" + ) + cohere_api_key = api_key or os.environ.get("CO_API_KEY") + if cohere_api_key is None: + raise RuntimeError( + "Cohere API key is required to use Cohere Reranker. " + "Please provide it as an argument " + "or set the CO_API_KEY environment variable." 
+ ) + self._client = cohere.Client(api_key=cohere_api_key) + self._model_name = model_name + self._top_n = top_n + + def rerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: + reranked_query_results: List[KBQueryResult] = [] + for result in results: + texts = [doc.text for doc in result.documents] + try: + response = self._client.rerank(query=result.query, + documents=texts, + top_n=self._top_n, + model=self._model_name) + except CohereAPIError as e: + raise RuntimeError("Failed to rerank documents using Cohere." + f" Underlying Error:\n{e.message}") + + reranked_docs = [] + for rerank_result in response: + doc = result.documents[rerank_result.index].model_copy( + deep=True, + update=dict(score=rerank_result.relevance_score) + ) + reranked_docs.append(doc) + + reranked_query_results.append(KBQueryResult(query=result.query, + documents=reranked_docs)) + return reranked_query_results + + async def arerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: + raise NotImplementedError() diff --git a/src/canopy/knowledge_base/reranker/reranker.py b/src/canopy/knowledge_base/reranker/reranker.py index b4e4f918..151449f2 100644 --- a/src/canopy/knowledge_base/reranker/reranker.py +++ b/src/canopy/knowledge_base/reranker/reranker.py @@ -19,24 +19,3 @@ def rerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: @abstractmethod async def arerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: pass - - -class TransparentReranker(Reranker): - """ - Transparent reranker that does nothing, it just returns the results as is. This is the default reranker. - The TransparentReranker is used as a placeholder for future development "forcing" every result set to be reranked. - """ # noqa: E501 - def rerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: - """ - Returns the results as is. - - Args: - results: A list of KBQueryResult to rerank. - - Returns: - results: A list of KBQueryResult, same as the input. - """ # noqa: E501 - return results - - async def arerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: - return results diff --git a/src/canopy/knowledge_base/reranker/transparent.py b/src/canopy/knowledge_base/reranker/transparent.py new file mode 100644 index 00000000..d9c5ed49 --- /dev/null +++ b/src/canopy/knowledge_base/reranker/transparent.py @@ -0,0 +1,26 @@ +from typing import List + +from canopy.knowledge_base.models import KBQueryResult +from canopy.knowledge_base.reranker import Reranker + + +class TransparentReranker(Reranker): + """ + Transparent reranker that does nothing, it just returns the results as is. This is the default reranker. + The TransparentReranker is used as a placeholder for future development "forcing" every result set to be reranked. + """ # noqa: E501 + + def rerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: + """ + Returns the results as is. + + Args: + results: A list of KBQueryResult to rerank. + + Returns: + results: A list of KBQueryResult, same as the input. 
+ """ # noqa: E501 + return results + + async def arerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: + return results diff --git a/src/canopy/llm/__init__.py b/src/canopy/llm/__init__.py index 96d363a5..0e34502d 100644 --- a/src/canopy/llm/__init__.py +++ b/src/canopy/llm/__init__.py @@ -2,3 +2,4 @@ from .openai import OpenAILLM from .anyscale import AnyscaleLLM from .azure_openai_llm import AzureOpenAILLM +from .cohere import CohereLLM diff --git a/src/canopy/llm/cohere.py b/src/canopy/llm/cohere.py new file mode 100644 index 00000000..cb55e75b --- /dev/null +++ b/src/canopy/llm/cohere.py @@ -0,0 +1,403 @@ +import time +from copy import deepcopy +from typing import Union, Iterable, Optional, Any, Dict, List + +from tenacity import retry, stop_after_attempt + +try: + import cohere +except (OSError, ImportError, ModuleNotFoundError): + _cohere_installed = False +else: + _cohere_installed = True + +from canopy.llm import BaseLLM +from canopy.llm.models import Function +from canopy.models.api_models import ( + _Choice, + _StreamChoice, + ChatResponse, + StreamingChatChunk, + TokenCounts, +) +from canopy.models.data_models import Context, MessageBase, Messages, Role, Query +from canopy.context_engine.context_builder.stuffing import StuffingContextContent + + +COMMON_PARAMS = { + "model", + "frequency_penalty", + "logit_bias", + "max_tokens", + "presence_penalty", + "stream", + "temperature", +} + + +EQUIVALENT_PARAMS = { + "top_p": "p", + "user": "user_name", +} + + +class CohereLLM(BaseLLM): + """ + Cohere LLM wrapper built on top of the Cohere Python client. + + Note: Cohere requires a valid API key to use this class. + You can set the "CO_API_KEY" environment variable to your API key. + """ + def __init__(self, + model_name: str = "command", + *, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + ignore_unrecognized_params: Optional[bool] = False, + **kwargs: Any, + ): + """ + Initialize the Cohere LLM. + + Args: + model_name: The name of the model to use. See https://docs.cohere.com/docs/models + api_key: Your Cohere API key. Defaults to None (uses the "CO_API_KEY" environment variable). + base_url: The base URL to use for the Cohere API. Defaults to None (uses the "CO_API_URL" environment variable if set, otherwise use default Cohere API URL). + ignore_unrecognized_params: Flag to suppress errors when unrecognized model params (from other LLMs) are passed to Cohere. + **kwargs: Generation default parameters to use for each request. See https://platform.openai.com/docs/api-reference/chat/create + For example, you can set the temperature, p, etc + These params can be overridden by passing a `model_params` argument to the `chat_completion` methods. + """ # noqa: E501 + super().__init__(model_name) + + if not _cohere_installed: + raise ImportError( + "Failed to import cohere. 
Make sure you install cohere extra "
+                "dependencies by running: "
+                "pip install canopy-sdk[cohere]"
+            )
+
+        try:
+            self._client = cohere.Client(api_key, api_url=base_url)
+        except cohere.error.CohereError as e:
+            raise RuntimeError(
+                "Failed to connect to Cohere, please make sure that the CO_API_KEY "
+                "environment variable is set correctly.\n"
+                f"Error: {e.message}"
+            )
+
+        self.ignore_unrecognized_params = ignore_unrecognized_params
+        self.default_model_params = kwargs
+
+    def chat_completion(self,
+                        system_prompt: str,
+                        chat_history: Messages,
+                        context: Optional[Context] = None,
+                        *,
+                        stream: bool = False,
+                        max_tokens: Optional[int] = None,
+                        model_params: Optional[dict] = None,
+                        ) -> Union[ChatResponse, Iterable[StreamingChatChunk]]:
+        """
+        Chat completion using the Cohere API.
+
+        Note: this function is wrapped in a retry decorator to handle transient errors.
+
+        Args:
+            system_prompt: The system prompt to use for the chat completion (preamble).
+            chat_history: Messages (chat history) to send to the model.
+            context: Knowledge base context to use for the chat completion. Defaults to None (no context).
+            stream: Whether to stream the response or not.
+            max_tokens: Maximum number of tokens to generate. Defaults to None (generates until stop sequence or until hitting max context size).
+            model_params: Model parameters to use for this request. Defaults to None (uses the default model parameters).
+                          Dictionary of parameters that override the default model parameters set on initialization.
+                          For example, you can pass: {"temperature": 0.9, "top_p": 1.0} to override the default temperature and top_p.
+                          see: https://platform.openai.com/docs/api-reference/chat/create
+        Returns:
+            ChatResponse or StreamingChatChunk
+
+        Usage:
+            >>> from canopy.llm import CohereLLM
+            >>> from canopy.models.data_models import UserMessage
+            >>> llm = CohereLLM()
+            >>> messages = [UserMessage(content="Hello! How are you?")]
+            >>> result = llm.chat_completion(system_prompt="", chat_history=messages)
+            >>> print(result.choices[0].message.content)
+            "I'm good, how are you?"
+        """ # noqa: E501
+        model_params_dict: Dict[str, Any] = deepcopy(self.default_model_params)
+        model_params_dict.update(
+            model_params or {}
+        )
+        model_params_dict["max_tokens"] = max_tokens
+
+        model_params_dict = self._convert_model_params(model_params_dict)
+
+        connectors = model_params_dict.pop('connectors', None)
+        messages: List[Dict[str, Any]] = self._map_messages(chat_history)
+        model_name = model_params_dict.pop('model', None) or self.model_name
+
+        if not messages:
+            raise RuntimeError("No message provided")
+
+        if system_prompt:
+            messages = self._prepend_system_prompt_to_messages(system_prompt, messages)
+
+        try:
+            response = self._client.chat(
+                model=model_name,
+                message=messages.pop()['message'],
+                chat_history=messages,
+                documents=self.generate_documents_from_context(context),
+                preamble_override=None,
+                stream=stream,
+                connectors=[
+                    {"id": connector} for connector in connectors
+                ] if connectors else None,
+                **model_params_dict
+            )
+        except cohere.error.CohereAPIError as e:
+            raise RuntimeError(
+                f"Failed to use Cohere's {model_name} model for chat "
+                f"completion. 
" + f"Underlying Error:\n{e.message}" + ) + + def streaming_iterator(res): + for chunk in res: + if chunk.event_type != "text-generation": + continue + + choice = _StreamChoice( + index=0, + delta={ + "content": chunk.text, + "function_call": None, + "role": Role.ASSISTANT, + "tool_calls": None + }, + finish_reason=None, + ) + streaming_chat_chunk = StreamingChatChunk( + id='', + object="chat.completion.chunk", + created=int(time.time()), + model=self.model_name, + choices=[choice], + ) + streaming_chat_chunk.id = chunk.id + + yield streaming_chat_chunk + + if stream: + return streaming_iterator(response) + + return ChatResponse( + id=response.id, + created=int(time.time()), + choices=[_Choice( + index=0, + message=MessageBase( + role=Role.ASSISTANT, + content=response.text, + ), + finish_reason="stop", + )], + object="chat.completion", + model=self.model_name, + usage=TokenCounts( + prompt_tokens=response.token_count["prompt_tokens"], + completion_tokens=response.token_count["response_tokens"], + total_tokens=response.token_count["billed_tokens"], + ), + ) + + @retry( + reraise=True, + stop=stop_after_attempt(3), + ) + def generate_search_queries(self, messages): + messages = self._map_messages(messages) + response = self._client.chat( + model=self.model_name, + message=messages[-1]['message'], + chat_history=messages[:-1], + stream=False, + search_queries_only=True, + ) + return [search_query['text'] for search_query in response.search_queries] + + def enforced_function_call(self, + system_prompt: str, + chat_history: Messages, + function: Function, + *, + max_tokens: Optional[int] = None, + model_params: Optional[dict] = None + ) -> dict: + raise NotImplementedError("Cohere LLM doesn't support function calling") + + async def aenforced_function_call(self, + system_prompt: str, + chat_history: Messages, + function: Function, *, + max_tokens: Optional[int] = None, + model_params: Optional[dict] = None): + raise NotImplementedError("Cohere LLM doesn't support function calling") + + async def achat_completion(self, + system_prompt: str, + chat_history: Messages, + context: Optional[Context] = None, + *, + stream: bool = False, + max_generated_tokens: Optional[int] = None, + model_params: Optional[dict] = None, + ) -> Union[ChatResponse, + Iterable[StreamingChatChunk]]: + raise NotImplementedError("Cohere LLM doesn't support async chat completion") + + async def agenerate_queries(self, + messages: Messages, + *, + max_generated_tokens: Optional[int] = None, + model_params: Optional[dict] = None, + ) -> List[Query]: + raise NotImplementedError("Cohere LLM doesn't support async query generation") + + def _convert_model_params(self, openai_model_params: dict) -> dict: + """ + Convert Open AI model params to Cohere equivalents. + + Args: + openai_model_params: model params passed from client to Canopy API in OpenAI format. + + Returns: + Model params used with Cohere Chat API. + """ # noqa: E501 + converted_model_params = {} + + for param in list(openai_model_params.keys()): + if param in COMMON_PARAMS: + converted_model_params[param] = openai_model_params.pop(param) + elif param in EQUIVALENT_PARAMS: + converted_model_params[EQUIVALENT_PARAMS[param]] = \ + openai_model_params.pop(param) + + # Scale is -2.0 to 2.0 with OpenAI, but -1.0 to 1.0 with Cohere. 
+        if presence_penalty := converted_model_params.get("presence_penalty"):
+            converted_model_params["presence_penalty"] = presence_penalty * 0.5
+
+        unrecognized_keys = set(openai_model_params.keys())
+        default_keys = set(self.default_model_params.keys())
+
+        if unrecognized_keys.difference(default_keys) \
+                and not self.ignore_unrecognized_params:
+            raise NotImplementedError(
+                f"{','.join(unrecognized_keys)} not supported by Cohere Chat API."
+            )
+
+        return converted_model_params
+
+    def _map_messages(self, messages: Messages) -> List[dict[str, Any]]:
+        """
+        Map the messages to the format expected by Cohere.
+
+        Cohere Chat API expects message history to be in the format:
+        {
+            "role": "USER",
+            "message": "message text"
+        }
+
+        System messages will be passed as user messages.
+
+        Args:
+            messages: Messages (chat history) to send to the model.
+
+        Returns:
+            A list of dicts in the format expected by the Cohere chat API.
+        """
+        mapped_messages = []
+
+        for message in messages:
+            if not message.content:
+                continue
+
+            mapped_messages.append({
+                "role": "CHATBOT" if message.role == Role.ASSISTANT else "USER",
+                "message": message.content,
+            })
+
+        return mapped_messages
+
+    def _prepend_system_prompt_to_messages(self,
+                                           system_prompt: str,
+                                           messages: List[dict[str, Any]]) -> (
+            List)[dict[str, Any]]:
+        """
+        Prepend the value passed as the system prompt to the messages.
+
+        Cohere does not have a direct equivalent to the system prompt, and when passing
+        documents it's preferred to send the system prompt as the first message instead.
+        """
+        first_message = messages[0]
+
+        if (first_message["message"] == system_prompt
+                and first_message["role"] == "USER"):
+            return messages
+
+        system_prompt_messages = [
+            {
+                "role": "USER",
+                "message": system_prompt,
+            },
+            {
+                "role": "CHATBOT",
+                "message": "Ok."
+            }
+        ]
+
+        return system_prompt_messages + messages
+
+    def generate_documents_from_context(
+            self, context: Optional[Context]) -> List[Dict[str, Any]]:
+        """
+        Generate document data to pass to the Cohere Chat API from the provided context.
+
+        Args:
+            context: Knowledge base context to use for the chat completion.
+
+        Returns:
+            documents: list of document objects for the Cohere API.
+        """
+        if not context:
+            return []
+
+        if isinstance(context.content, StuffingContextContent):
+            return (
+                self.generate_documents_from_stuffing_context_content(context.content)
+            )
+
+        raise NotImplementedError(
+            "Cohere LLM is currently supported only with StuffingContextBuilder."
+        )
+
+    def generate_documents_from_stuffing_context_content(
+            self,
+            content: StuffingContextContent) -> List[Dict[str, Any]]:
+        """
+        Generate document data to pass to the Cohere Chat API from StuffingContextContent.
+
+        Args:
+            content: Stuffing context content from the context.
+
+        Returns:
+            documents: list of document objects for the Cohere API. 
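+                Each item is the `model_dump()` of a context snippet, e.g. a dict
+                with its "source" and "text" fields (field names as used by
+                ContextSnippet; shown for illustration).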
+ """ + documents = [] + + for result in content.root: + for snippet in result.snippets: + documents.append(snippet.model_dump()) + + return documents diff --git a/src/canopy/llm/models.py b/src/canopy/llm/models.py index ed0c33a2..fe65997b 100644 --- a/src/canopy/llm/models.py +++ b/src/canopy/llm/models.py @@ -1,6 +1,6 @@ from typing import Optional, List, Union -from pydantic import BaseModel +from pydantic import BaseModel, model_serializer class FunctionPrimitiveProperty(BaseModel): @@ -17,8 +17,8 @@ class FunctionArrayProperty(BaseModel): # because the model is more struggling with them description: str - def dict(self, *args, **kwargs): - super_dict = super().dict(*args, **kwargs) + def model_dump(self, *args, **kwargs): + super_dict = super().model_dump(*args, **kwargs) if "items_type" in super_dict: super_dict["type"] = "array" super_dict["items"] = {"type": super_dict.pop("items_type")} @@ -32,11 +32,12 @@ class FunctionParameters(BaseModel): required_properties: List[FunctionProperty] optional_properties: List[FunctionProperty] = [] - def dict(self, *args, **kwargs): + @model_serializer() + def serialize_model(self): return { "type": "object", "properties": { - pro.name: pro.dict(exclude_none=True, exclude={"name"}) + pro.name: pro.model_dump(exclude_none=True, exclude={"name"}) for pro in self.required_properties + self.optional_properties }, "required": [pro.name for pro in self.required_properties], diff --git a/src/canopy/llm/openai.py b/src/canopy/llm/openai.py index 3e73248d..f89ccd8a 100644 --- a/src/canopy/llm/openai.py +++ b/src/canopy/llm/openai.py @@ -5,7 +5,9 @@ import openai import json -from openai.types.chat import ChatCompletionToolParam +from openai import Stream +from openai.types.chat import (ChatCompletionToolParam, ChatCompletionChunk, + ChatCompletion) from tenacity import ( retry, stop_after_attempt, @@ -121,8 +123,8 @@ def chat_completion(self, system_message = system_prompt else: system_message = system_prompt + f"\nContext: {context.to_text()}" - messages = [SystemMessage(content=system_message).dict() - ] + [m.dict() for m in chat_history] + messages = [SystemMessage(content=system_message).model_dump() + ] + [m.model_dump() for m in chat_history] try: response = self._client.chat.completions.create(model=model, messages=messages, @@ -131,14 +133,14 @@ def chat_completion(self, except openai.OpenAIError as e: self._handle_chat_error(e) - def streaming_iterator(response): - for chunk in response: - yield StreamingChatChunk.parse_obj(chunk) + def streaming_iterator(chunks: Stream[ChatCompletionChunk]): + for chunk in chunks: + yield StreamingChatChunk.model_validate(chunk.model_dump()) if stream: - return streaming_iterator(response) + return streaming_iterator(cast(Stream[ChatCompletionChunk], response)) - return ChatResponse.parse_obj(response) + return ChatResponse.model_validate(cast(ChatCompletion, response).model_dump()) @retry( reraise=True, @@ -206,10 +208,10 @@ def enforced_function_call(self, model = model_params_dict.pop("model", self.model_name) function_dict = cast(ChatCompletionToolParam, - {"type": "function", "function": function.dict()}) + {"type": "function", "function": function.model_dump()}) - messages = [SystemMessage(content=system_prompt).dict() - ] + [m.dict() for m in chat_history] + messages = [SystemMessage(content=system_prompt).model_dump() + ] + [m.model_dump() for m in chat_history] try: chat_completion = self._client.chat.completions.create( model=model, @@ -226,7 +228,7 @@ def enforced_function_call(self, result = 
chat_completion.choices[0].message.tool_calls[0].function.arguments arguments = json.loads(result) - jsonschema.validate(instance=arguments, schema=function.parameters.dict()) + jsonschema.validate(instance=arguments, schema=function.parameters.model_dump()) return arguments async def achat_completion(self, diff --git a/src/canopy/models/api_models.py b/src/canopy/models/api_models.py index 53a93585..dbdc82f7 100644 --- a/src/canopy/models/api_models.py +++ b/src/canopy/models/api_models.py @@ -1,6 +1,6 @@ from typing import Optional, Sequence, Iterable -from pydantic import BaseModel, Field, validator +from pydantic import BaseModel, Field from canopy.models.data_models import MessageBase @@ -20,11 +20,7 @@ class _StreamChoice(BaseModel): class TokenCounts(BaseModel): prompt_tokens: int completion_tokens: int - total_tokens: Optional[int] = None - - @validator("total_tokens", always=True) - def calc_total_tokens(cls, v, values, **kwargs): - return values["prompt_tokens"] + values["completion_tokens"] + total_tokens: int class ChatResponse(BaseModel): diff --git a/src/canopy/models/data_models.py b/src/canopy/models/data_models.py index dac2ffc1..4edfcb0b 100644 --- a/src/canopy/models/data_models.py +++ b/src/canopy/models/data_models.py @@ -2,11 +2,16 @@ from enum import Enum from typing import Optional, List, Union, Dict, Literal -from pydantic import BaseModel, Field, validator, Extra +from pydantic import field_validator, ConfigDict, BaseModel, Field, RootModel +from typing_extensions import TypedDict Metadata = Dict[str, Union[str, int, float, List[str]]] +class SparseVector(TypedDict): + indices: List[int] + values: List[float] + # ----------------- Context Engine models ----------------- @@ -37,11 +42,10 @@ class Document(BaseModel): default_factory=dict, description="The document metadata. To learn more about metadata, see https://docs.pinecone.io/docs/manage-data", # noqa: E501 ) + model_config = ConfigDict(extra="forbid", coerce_numbers_to_str=True) - class Config: - extra = Extra.forbid - - @validator("metadata") + @field_validator("metadata") + @classmethod def metadata_reseved_fields(cls, v): if "text" in v: raise ValueError('Metadata cannot contain reserved field "text"') @@ -52,7 +56,7 @@ def metadata_reseved_fields(cls, v): return v -class ContextContent(BaseModel, ABC): +class ContextContent(RootModel, ABC): # Any context should be able to be represented as well formatted text. # In the most minimal case, that could simply be a call to `.json()`. 
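+    # With Pydantic v2's RootModel, subclasses expose their value via `.root`
+    # rather than the v1 `__root__` attribute, e.g.
+    # StringContextContent(root="some text").to_text() == "some text".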
@abstractmethod @@ -64,10 +68,10 @@ def __str__(self): class StringContextContent(ContextContent): - __root__: str + root: str def to_text(self, **kwargs) -> str: - return self.__root__ + return self.root class Context(BaseModel): @@ -93,8 +97,8 @@ class MessageBase(BaseModel): "Can be one of ['User', 'Assistant', 'System']") content: str = Field(description="The contents of the message.") - def dict(self, *args, **kwargs): - d = super().dict(*args, **kwargs) + def model_dump(self, *args, **kwargs): + d = super().model_dump(*args, **kwargs) d["role"] = d["role"].value return d diff --git a/src/canopy/tokenizer/__init__.py b/src/canopy/tokenizer/__init__.py index 9c699052..18d9006c 100644 --- a/src/canopy/tokenizer/__init__.py +++ b/src/canopy/tokenizer/__init__.py @@ -1,3 +1,4 @@ -from .openai import OpenAITokenizer +from .cohere import CohereAPITokenizer, CohereHFTokenizer from .llama import LlamaTokenizer +from .openai import OpenAITokenizer from .tokenizer import Tokenizer diff --git a/src/canopy/tokenizer/cohere.py b/src/canopy/tokenizer/cohere.py new file mode 100644 index 00000000..0f00540c --- /dev/null +++ b/src/canopy/tokenizer/cohere.py @@ -0,0 +1,197 @@ +from typing import List, Optional + +from tokenizers import Tokenizer as HfTokenizer +try: + import cohere +except (OSError, ImportError, ModuleNotFoundError): + _cohere_installed = False +else: + _cohere_installed = True + +from .base import BaseTokenizer +from ..models.data_models import Messages + + +class CohereHFTokenizer(BaseTokenizer): + """ + Tokenizer for Cohere models, based on the Hugging Face tokenizers library. + + Usage: + Initialize the singleton tokenizer with the LlamaTokenizer class: + >>> from canopy.tokenizer import Tokenizer + >>> Tokenizer.initialize(tokenizer_class=CohereHFTokenizer, + model_name="Cohere/Command-nightly") + + You can then use the tokenizer instance from anywhere in the code: + >>> from canopy.tokenizer import Tokenizer + >>> tokenizer = Tokenizer() + >>> tokenizer.tokenize("Hello World!") + ['▁Hello', '▁World', '!'] + """ + + MESSAGE_TOKENS_OVERHEAD = 3 + FIXED_PREFIX_TOKENS = 3 + + def __init__( + self, + model_name: str = "Cohere/Command-nightly", + ): + """ + Initialize the tokenizer. + + Args: + model_name: The name of the Hugging Face model to use. Defaults to "Cohere/Command-nightly". + """ # noqa: E501 + if not _cohere_installed: + raise ImportError( + "Failed to import cohere. Make sure you install cohere extra " + "dependencies by running: " + "pip install canopy-sdk[cohere]" + ) + + self._encoder = HfTokenizer.from_pretrained(model_name) + + def tokenize(self, text: str) -> List[str]: + """ + Tokenize a text using HuggingFace's tokenizers library. + + Args: + text: The text to tokenize. + + Returns: + The list of tokens. + """ + return self._encoder.encode(text, add_special_tokens=False).tokens + + def detokenize(self, tokens: List[str]) -> str: + """ + Detokenize a list of tokens that were previously tokenized using this tokenizer. + + Args: + tokens: The list of tokens to detokenize. + + Returns: + The detokenized text as a string. + """ + if not isinstance(tokens, List): + raise TypeError(f"detokenize expect List[str], got f{type(tokens)}") + + ids = [self._encoder.token_to_id(token) for token in tokens] + return self._encoder.decode(ids) + + def token_count(self, text: str) -> int: + """ + Count the number of tokens in a text. + + Args: + text: The text to count the tokens of. + + Returns: + The number of tokens in the text. 
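+        Example (token counts depend on the loaded vocabulary; this mirrors the
+        class-level `tokenize` example above):
+            >>> tokenizer = CohereHFTokenizer()
+            >>> tokenizer.token_count("Hello World!")
+            3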
+ """ + return len(self._encoder.encode(text, add_special_tokens=False).ids) + + def messages_token_count(self, messages: Messages) -> int: + """ + Count the number of tokens in a list of messages, as expected to be + counted by Cohere models. + + Args: + messages: The list of messages to count the tokens of. + + Returns: + The number of tokens in the messages, as expected to be counted by Cohere models. + """ # noqa: E501 + num_tokens = 0 + for message in messages: + num_tokens += self.MESSAGE_TOKENS_OVERHEAD + for key, value in message.model_dump().items(): + num_tokens += self.token_count(value) + num_tokens += self.FIXED_PREFIX_TOKENS + return num_tokens + + +class CohereAPITokenizer(BaseTokenizer): + """ + Tokenizer for Cohere models, based on the Cohere Tokenize API. + + Usage: + Initialize the singleton tokenizer with the CohereAPITokenizer class: + >>> from canopy.tokenizer import Tokenizer + >>> Tokenizer.initialize(tokenizer_class=CohereAPITokenizer, model_name="embed-multilingual-v3.0") + + You can then use the tokenizer instance from anywhere in the code: + >>> from canopy.tokenizer import Tokenizer + >>> tokenizer = Tokenizer() + >>> tokenizer.tokenize("Hello world!") + ['Hello', ' world', '!'] + """ # noqa: E501 + + MESSAGE_TOKENS_OVERHEAD = 3 + FIXED_PREFIX_TOKENS = 3 + + def __init__(self, + model_name: Optional[str] = None, + *, + api_key: Optional[str] = None, + api_url: Optional[str] = None): + """ + Initialize the tokenizer. + + Args: + model_name: The name of the model to use. + api_key: Your Cohere API key. Defaults to None (uses the "CO_API_KEY" environment variable). + api_url: The base URL to use for the Cohere API. Defaults to None (uses the "CO_API_URL" environment variable if set, otherwise use default Cohere API URL). + """ # noqa: E501 + self.model_name = model_name + self._client = cohere.Client(api_key, api_url=api_url) + + def tokenize(self, text: str) -> List[str]: + """ + Tokenize a text using Cohere Tokenize API. + + Args: + text: The text to tokenize. + + Returns: + The list of tokens. + """ + if not text: + return [] + + tokens = self._client.tokenize(text, model=self.model_name) + return tokens.token_strings + + def detokenize(self, tokens: List[str]) -> str: + """ + Detokenize a list of tokens that were previously tokenized using this tokenizer. + + Args: + tokens: The list of tokens to detokenize. + + Returns: + The detokenized text as a string. + """ + if not isinstance(tokens, List): + raise TypeError(f"detokenize expects List[str], got f{type(tokens)}") + return "".join(tokens) + + def messages_token_count(self, messages: Messages) -> int: + """ + Count the number of tokens in a list of messages as expected to be counted by Cohere models. + Account for the overhead of the messages structure. + Taken from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb + + Args: + messages: The list of messages to count the tokens of. + + Returns: + The number of tokens in the messages, as expected to be counted by OpenAI models. 
+ """ # noqa: E501 + num_tokens = 0 + for message in messages: + num_tokens += self.MESSAGE_TOKENS_OVERHEAD + for key, value in message.model_dump().items(): + num_tokens += self.token_count(value) + num_tokens += self.FIXED_PREFIX_TOKENS + return num_tokens diff --git a/src/canopy/tokenizer/llama.py b/src/canopy/tokenizer/llama.py index b7384de9..4c4ddc84 100644 --- a/src/canopy/tokenizer/llama.py +++ b/src/canopy/tokenizer/llama.py @@ -2,7 +2,13 @@ from .base import BaseTokenizer from ..models.data_models import Messages import os -from transformers import LlamaTokenizerFast as HfTokenizer + +try: + from transformers import LlamaTokenizerFast as HfTokenizer +except (OSError, ImportError, ModuleNotFoundError): + _tranformers_installed = False +else: + _tranformers_installed = True class LlamaTokenizer(BaseTokenizer): @@ -37,6 +43,13 @@ def __init__( model_name: The name of the model to use. Defaults to "openlm-research/open_llama_7b_v2". hf_token: Huggingface token """ # noqa: E501 + if not _tranformers_installed: + raise ImportError( + "The transformers library is required to use the LlamaTokenizer. " + "Please install canopy with the [transformers] extra: " + "pip install canopy-sdk[transformers]" + ) + hf_token = hf_token or os.environ.get("HUGGINGFACE_TOKEN", "") # Add legacy=True to avoid extra printings self._encoder = HfTokenizer.from_pretrained( @@ -98,7 +111,7 @@ def messages_token_count(self, messages: Messages) -> int: num_tokens = 0 for message in messages: num_tokens += self.MESSAGE_TOKENS_OVERHEAD - for key, value in message.dict().items(): + for key, value in message.model_dump().items(): num_tokens += self.token_count(value) num_tokens += self.FIXED_PREFIX_TOKENS return num_tokens diff --git a/src/canopy/tokenizer/openai.py b/src/canopy/tokenizer/openai.py index 2c00256a..fc34a8d9 100644 --- a/src/canopy/tokenizer/openai.py +++ b/src/canopy/tokenizer/openai.py @@ -91,7 +91,7 @@ def messages_token_count(self, messages: Messages) -> int: num_tokens = 0 for message in messages: num_tokens += self.MESSAGE_TOKENS_OVERHEAD - for key, value in message.dict().items(): + for key, value in message.model_dump().items(): num_tokens += self.token_count(value) num_tokens += self.FIXED_PREFIX_TOKENS return num_tokens diff --git a/src/canopy/utils/debugging.py b/src/canopy/utils/debugging.py new file mode 100644 index 00000000..2537de4d --- /dev/null +++ b/src/canopy/utils/debugging.py @@ -0,0 +1,3 @@ +import os + +CANOPY_DEBUG_INFO = os.getenv("CANOPY_DEBUG_INFO", "FALSE").lower() == "true" diff --git a/src/canopy/utils/directory.py b/src/canopy/utils/directory.py new file mode 100644 index 00000000..64a9a6ea --- /dev/null +++ b/src/canopy/utils/directory.py @@ -0,0 +1,8 @@ +from pathlib import Path + + +class Directory: + """Stores the directory paths for Canopy library""" + + ROOT = Path(__file__).parent.parent + CONFIG_TEMPLATES = ROOT.joinpath("config_templates") diff --git a/src/canopy_cli/cli.py b/src/canopy_cli/cli.py index e009cce6..c2564da2 100644 --- a/src/canopy_cli/cli.py +++ b/src/canopy_cli/cli.py @@ -1,12 +1,11 @@ import os -import signal -import subprocess +import shutil from typing import Dict, Any, Optional, List, Iterable import click from prompt_toolkit import prompt import time - +from pathlib import Path import requests import yaml from dotenv import load_dotenv @@ -23,6 +22,7 @@ from canopy.chat_engine import ChatEngine from canopy.models.data_models import Document, UserMessage from canopy.tokenizer import Tokenizer +from canopy.utils.directory import Directory 
from canopy_cli.data_loader import ( load_from_path, IDsNotUniqueError, @@ -145,11 +145,12 @@ def _load_kb_config(config_file: Optional[str]) -> Dict[str, Any]: def _validate_chat_engine(config_file: Optional[str]): config = _read_config_file(config_file) - Tokenizer.initialize() + tokenizer_config = config.get("tokenizer", {}) try: # If the server itself will fail, we can't except the error, since it's running # in a different process. Try to load and run the ChatEngine so we can catch # any errors and print a nice message. + Tokenizer.initialize_from_config(tokenizer_config) chat_engine = ChatEngine.from_config(config.get("chat_engine", {})) chat_engine.max_generated_tokens = 5 chat_engine.context_engine.knowledge_base.connect() @@ -179,7 +180,7 @@ def __init__(self, name=None, commands=None, **attrs): "health": 4, "stop": 5, "api-docs": 6, - + "create-config": 7, } def list_commands(self, ctx): @@ -211,6 +212,29 @@ def health(url): return +@cli.command(help="Writes the config templates to a directory.") +@click.argument("out_path", type=click.Path(), required=True) +def create_config(out_path): + + out_path = Path(out_path) + + if out_path.is_file(): + raise CLIError(f"Path expected to be a directory," + f"but found a file at {out_path}") + + if out_path.exists() and any(out_path.iterdir()): + click.confirm(click.style(f"Path {out_path} is not empty. Overwrite?", + fg="red"), + abort=True) + + try: + shutil.copytree(Directory.CONFIG_TEMPLATES, out_path, dirs_exist_ok=True) + except Exception as e: + raise CLIError(f"Failed to write config template to {out_path}. Reason:\n{e}") + + click.echo(click.style(f"Config templates written to {out_path}", fg="green")) + + @cli.command( help=( """ @@ -374,7 +398,7 @@ def upsert(index_name: str, ) raise CLIError(msg) pd.options.display.max_colwidth = 20 - click.echo(pd.DataFrame([doc.dict(exclude_none=True) for doc in data[:5]])) + click.echo(pd.DataFrame([doc.model_dump(exclude_none=True) for doc in data[:5]])) click.echo(click.style(f"\nTotal records: {len(data)}")) click.confirm(click.style("\nDoes this data look right?", fg="red"), abort=True) @@ -666,55 +690,6 @@ def start(host: str, port: str, reload: bool, stream: bool, start_server(host, port=port, reload=reload, config_file=config) -@cli.command( - help=( - """ - \b - Stop the Canopy server. - - This command sends a shutdown request to the Canopy server. - """ - ) -) -@click.option("url", "--url", default=DEFAULT_SERVER_URL, - help=("URL of the Canopy server to use. " - f"Defaults to {DEFAULT_SERVER_URL}")) -def stop(url): - if os.name != "nt": - # Check if the server was started using Gunicorn - res = subprocess.run(["pgrep", "-f", "gunicorn canopy_server.app:app"], - capture_output=True) - output = res.stdout.decode("utf-8").split() - - # If Gunicorn was used, kill all Gunicorn processes - if output: - msg = ("It seems that Canopy server was launched using Gunicorn.\n" - "Do you want to kill all Gunicorn processes?") - click.confirm(click.style(msg, fg="red"), abort=True) - try: - subprocess.run(["pkill", "-f", "gunicorn canopy_server.app:app"], - check=True) - except subprocess.CalledProcessError: - try: - [os.kill(int(pid), signal.SIGINT) for pid in output] - except OSError: - msg = ( - "Could not kill Gunicorn processes. Please kill them manually." 
- f"Found process ids: {output}" - ) - raise CLIError(msg) - - try: - res = requests.get(urljoin(url, "shutdown")) - res.raise_for_status() - return res.ok - except requests.exceptions.ConnectionError: - msg = f""" - Could not find Canopy server on {url}. - """ - raise CLIError(msg) - - @cli.command( help=( """ diff --git a/src/canopy_server/app.py b/src/canopy_server/app.py index f4572857..69c048d0 100644 --- a/src/canopy_server/app.py +++ b/src/canopy_server/app.py @@ -1,11 +1,10 @@ import os import logging -import signal import sys import uuid +from contextlib import asynccontextmanager import openai -from multiprocessing import current_process, parent_process import yaml from dotenv import load_dotenv @@ -38,7 +37,6 @@ ContextUpsertRequest, HealthStatus, ContextDeleteRequest, - ShutdownResponse, SuccessUpsertResponse, SuccessDeleteResponse, ContextResponse, @@ -48,7 +46,6 @@ from canopy_cli.errors import ConfigError from canopy import __version__ - APIChatResponse = Union[ChatResponse, EventSourceResponse] load_dotenv() # load env vars before import of openai @@ -69,6 +66,16 @@ You can find your free trial OpenAI API key https://platform.openai.com/account/api-keys. You might need to log in or register for OpenAI services. """ # noqa: E501 + +@asynccontextmanager +async def lifespan(app: FastAPI): + _init_logging() + _init_engines() + _init_routes(app) + await health_check() + yield + + API_VERSION = "v1" # Global variables - Application @@ -80,6 +87,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0.html", }, + lifespan=lifespan ) openai_api_router = APIRouter() context_api_router = APIRouter(prefix="/context") @@ -101,8 +109,8 @@ responses={500: {"description": "Failed to chat with Canopy"}}, # noqa: E501 ) async def chat( - request: ChatRequest = Body(...), - namespace: Optional[str] = None, + request: ChatRequest = Body(...), + namespace: Optional[str] = None, ) -> APIChatResponse: """ Chat with Canopy, using the LLM and context engine, and return a response. @@ -124,7 +132,7 @@ async def chat( session_id = request.user or "None" # noqa: F841 question_id = str(uuid.uuid4()) logger.debug(f"Received chat request: {request.messages[-1].content}") - model_params = request.dict(exclude={"messages", "stream"}) + model_params = request.model_dump(exclude={"messages", "stream"}) answer = await run_in_threadpool( chat_engine.chat, messages=request.messages, @@ -162,8 +170,8 @@ def stringify_content(response: StreamingChatResponse): }, ) async def query( - request: ContextQueryRequest = Body(...), - namespace: Optional[str] = None, + request: ContextQueryRequest = Body(...), + namespace: Optional[str] = None, ) -> ContextResponse: """ Query the knowledge base for relevant context. @@ -200,8 +208,8 @@ async def query( responses={500: {"description": "Failed to upsert documents"}}, ) async def upsert( - request: ContextUpsertRequest = Body(...), - namespace: str = "" + request: ContextUpsertRequest = Body(...), + namespace: str = "" ) -> SuccessUpsertResponse: """ Upsert documents into the knowledge base. Upserting is a way to add new documents or update existing ones. @@ -231,8 +239,8 @@ async def upsert( responses={500: {"description": "Failed to delete documents"}}, ) async def delete( - request: ContextDeleteRequest = Body(...), - namespace: Optional[str] = None, + request: ContextDeleteRequest = Body(...), + namespace: Optional[str] = None, ) -> SuccessDeleteResponse: """ Delete documents from the knowledgebase. Deleting documents is done by their unique ID. 
@@ -284,41 +292,6 @@ async def health_check() -> HealthStatus: return HealthStatus(pinecone_status="OK", llm_status="OK") -@application_router.get("/shutdown") -async def shutdown() -> ShutdownResponse: - """ - __WARNING__: Experimental method. - - - This method will shutdown the server. It is used for testing purposes, and not recommended to be used - in production. - This method will locate the parent process and send a SIGINT signal to it. - """ # noqa: E501 - logger.info("Shutting down") - proc = current_process() - p_process = parent_process() - pid = p_process.pid if p_process is not None else proc.pid - if not pid: - raise HTTPException( - status_code=500, - detail="Failed to locate parent process. Cannot shutdown server.", - ) - if sys.platform == 'win32': - kill_signal = signal.CTRL_C_EVENT - else: - kill_signal = signal.SIGINT - os.kill(pid, kill_signal) - return ShutdownResponse() - - -@app.on_event("startup") -async def startup(): - _init_logging() - _init_engines() - _init_routes(app) - await health_check() - - def _init_routes(app): # Include the API version in the path, API_VERSION should be the latest version. app.include_router(application_router, prefix=f"/{API_VERSION}") @@ -341,7 +314,7 @@ def _init_logging(): handlers = [file_handler, stdout_handler] logging.basicConfig( format="%(asctime)s - %(processName)s - %(name)-10s [%(levelname)-8s]: " - "%(message)s", + "%(message)s", level=os.getenv("CE_LOG_LEVEL", "INFO").upper(), handlers=handlers, force=True, diff --git a/src/canopy_server/models/v1/api_models.py b/src/canopy_server/models/v1/api_models.py index 197babf4..db1080f6 100644 --- a/src/canopy_server/models/v1/api_models.py +++ b/src/canopy_server/models/v1/api_models.py @@ -1,6 +1,6 @@ from typing import Dict, List, Optional, Union -from pydantic import BaseModel, Field +from pydantic import ConfigDict, BaseModel, Field from canopy.models.data_models import Messages, Query, Document @@ -70,9 +70,7 @@ class ChatRequest(BaseModel): default=None, description="A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. 
Unused, reserved for future extensions", # noqa: E501 ) - - class Config: - extra = "ignore" + model_config = ConfigDict(extra="ignore") class ContextQueryRequest(BaseModel): @@ -114,13 +112,6 @@ def to_text( return self.json() -class ShutdownResponse(BaseModel): - message: str = Field( - default="Shutting down", - description="Message indicating the server is shutting down.", - ) - - class SuccessUpsertResponse(BaseModel): message: str = Field( default="Success", diff --git a/tests/e2e/test_app.py b/tests/e2e/test_app.py index 6ffc39e9..4fab8312 100644 --- a/tests/e2e/test_app.py +++ b/tests/e2e/test_app.py @@ -104,7 +104,7 @@ def test_health(client): assert health_response.is_success assert ( health_response.json() - == HealthStatus(pinecone_status="OK", llm_status="OK").dict() + == HealthStatus(pinecone_status="OK", llm_status="OK").model_dump() ) @@ -112,7 +112,7 @@ def test_upsert(client, namespace_prefix): # Upsert a document to the index upsert_response = client.post( f"{namespace_prefix}context/upsert", - json=upsert_payload.dict()) + json=upsert_payload.model_dump()) assert upsert_response.is_success @@ -133,7 +133,7 @@ def test_query(client, namespace_prefix): query_response = client.post( f"{namespace_prefix}context/query", - json=query_payload.dict()) + json=query_payload.model_dump()) assert query_response.is_success query_response = query_response.json() @@ -143,12 +143,12 @@ def test_query(client, namespace_prefix): stuffing_content = json.loads(query_response["content"]) assert ( stuffing_content[0]["query"] - == query_payload.dict()["queries"][0]["text"] + == query_payload.model_dump()["queries"][0]["text"] and stuffing_content[0]["snippets"][0]["text"] - == upsert_payload.dict()["documents"][0]["text"] + == upsert_payload.model_dump()["documents"][0]["text"] ) assert (stuffing_content[0]["snippets"][0]["source"] == - upsert_payload.dict()["documents"][0]["source"]) + upsert_payload.model_dump()["documents"][0]["source"]) def test_chat_required_params(client, namespace_prefix): diff --git a/tests/system/knowledge_base/test_knowledge_base.py b/tests/system/knowledge_base/test_knowledge_base.py index 56a85da2..4946af05 100644 --- a/tests/system/knowledge_base/test_knowledge_base.py +++ b/tests/system/knowledge_base/test_knowledge_base.py @@ -3,7 +3,12 @@ import pytest import numpy as np -from pinecone import Index, Pinecone +try: + from pinecone.grpc import PineconeGRPC as Pinecone + from pinecone.grpc import GRPCIndex as Index +except ImportError: + from pinecone import Pinecone, Index + from tenacity import ( retry, stop_after_delay, diff --git a/tests/system/llm/test_cohere.py b/tests/system/llm/test_cohere.py new file mode 100644 index 00000000..f638d391 --- /dev/null +++ b/tests/system/llm/test_cohere.py @@ -0,0 +1,300 @@ +from unittest.mock import MagicMock + +import pytest +from cohere.error import CohereAPIError + +from canopy.models.data_models import Context, ContextContent, Role, MessageBase +from canopy.context_engine.context_builder.stuffing import ( + StuffingContextContent, ContextQueryResult, ContextSnippet +) +from canopy.models.api_models import ChatResponse, StreamingChatChunk +from canopy.llm.cohere import CohereLLM + + +def assert_chat_completion(response): + assert len(response.choices) == 1 # Cohere API does not return multiple choices. 
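+    # The remaining assertions check the shape of that single choice: a
+    # MessageBase whose content is a non-empty string and whose role is a Role.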
+ + assert isinstance(response.choices[0].message, MessageBase) + assert isinstance(response.choices[0].message.content, str) + assert len(response.choices[0].message.content) > 0 + assert isinstance(response.choices[0].message.role, Role) + + +@pytest.fixture +def model_name(): + return "command" + + +@pytest.fixture +def system_prompt(): + return "Use only the provided documents to answer." + + +@pytest.fixture +def expected_chat_kwargs(system_prompt): + return { + "model": "command", + "message": "Just checking in. Be concise.", + "chat_history": [ + {"role": "USER", "message": "Use only the provided documents to answer."}, + {"role": "CHATBOT", "message": "Ok."}, + {'role': 'USER', 'message': 'Hello, assistant.'}, + {"role": "CHATBOT", "message": "Hello, user. How can I assist you?"} + ], + "connectors": None, + "documents": [], + "preamble_override": None, + "stream": False, + "max_tokens": None, + } + + +@pytest.fixture +def model_params_high_temperature(): + return {"temperature": 0.9} + + +@pytest.fixture +def model_params_low_temperature(): + return {"temperature": 0.2} + + +@pytest.fixture +def cohere_llm(): + return CohereLLM() + + +@pytest.fixture +def unsupported_context(): + class UnsupportedContextContent(ContextContent): + def to_text(self, **kwargs): + return '' + + return Context(content=UnsupportedContextContent(), num_tokens=123) + + +def test_init_with_custom_params(): + llm = CohereLLM(model_name="test_model_name", + api_key="test_api_key", + temperature=0.9) + + assert llm.model_name == "test_model_name" + assert llm.default_model_params["temperature"] == 0.9 + assert llm._client.api_key == "test_api_key" + + +def test_chat_completion(cohere_llm, messages, system_prompt, expected_chat_kwargs): + cohere_llm._client = MagicMock(wraps=cohere_llm._client) + response = cohere_llm.chat_completion( + chat_history=messages, system_prompt=system_prompt) + cohere_llm._client.chat.assert_called_once_with(**expected_chat_kwargs) + assert_chat_completion(response) + + +def test_chat_completion_high_temperature(cohere_llm, + messages, + model_params_high_temperature): + response = cohere_llm.chat_completion( + chat_history=messages, + model_params=model_params_high_temperature, + system_prompt='', + ) + assert_chat_completion(response) + + +def test_chat_completion_low_temperature(cohere_llm, + messages, + model_params_low_temperature): + response = cohere_llm.chat_completion(chat_history=messages, + model_params=model_params_low_temperature, + system_prompt='') + assert_chat_completion(response) + + +def test_chat_completion_without_system_prompt(cohere_llm, + messages, + expected_chat_kwargs): + expected_chat_kwargs["chat_history"] = expected_chat_kwargs["chat_history"][2:] + cohere_llm._client = MagicMock(wraps=cohere_llm._client) + response = cohere_llm.chat_completion( + chat_history=messages, system_prompt="") + cohere_llm._client.chat.assert_called_once_with(**expected_chat_kwargs) + assert_chat_completion(response) + + +def test_chat_streaming(cohere_llm, messages): + stream = True + response = cohere_llm.chat_completion(chat_history=messages, + stream=stream, + system_prompt='') + messages_received = [message for message in response] + assert len(messages_received) > 0 + + for message in messages_received: + assert isinstance(message, StreamingChatChunk) + assert message.object == "chat.completion.chunk" + + +def test_max_tokens(cohere_llm, messages): + max_tokens = 2 + response = cohere_llm.chat_completion(chat_history=messages, + max_tokens=max_tokens, + 
system_prompt='') + assert isinstance(response, ChatResponse) + assert len(response.choices[0].message.content.split()) <= max_tokens + + +def test_missing_messages(cohere_llm): + with pytest.raises(RuntimeError, match="No message provided"): + cohere_llm.chat_completion(chat_history=[], system_prompt='') + + +def test_negative_max_tokens(cohere_llm, messages): + with pytest.raises(RuntimeError, match="max_tokens cannot be less than 0"): + cohere_llm.chat_completion( + chat_history=messages, max_tokens=-5, system_prompt='') + + +def test_chat_completion_api_failure_propagates(cohere_llm, + messages): + cohere_llm._client = MagicMock() + cohere_llm._client.chat.side_effect = CohereAPIError("API call failed") + + with pytest.raises(RuntimeError, match="API call failed"): + cohere_llm.chat_completion(chat_history=messages, system_prompt="") + + +def test_chat_completion_with_unsupported_context_engine(cohere_llm, + messages, + unsupported_context): + cohere_llm._client = MagicMock() + + with pytest.raises(NotImplementedError): + cohere_llm.chat_completion(chat_history=messages, + system_prompt="", + context=unsupported_context) + + +def test_chat_completion_with_unrecognized_param_raises_error(cohere_llm, messages): + with pytest.raises(NotImplementedError): + cohere_llm.chat_completion(chat_history=messages, + system_prompt="", + model_params={ + "functions": {}, + }) + + +def test_chat_completion_ignores_unrecognized_model_params_with_init_kwarg(messages): + cohere_llm = CohereLLM(ignore_unrecognized_params=True) + response = cohere_llm.chat_completion(chat_history=messages, + system_prompt="", + model_params={ + "functions": {}, + }) + assert response.object == "chat.completion" + + +def test_chat_completion_with_equivalent_model_params(cohere_llm, + messages, + system_prompt, + expected_chat_kwargs): + cohere_llm._client = MagicMock(wraps=cohere_llm._client) + response = cohere_llm.chat_completion( + chat_history=messages, + system_prompt=system_prompt, + model_params={ + "top_p": 0.9, + "user": "admin", + } + ) + expected_chat_kwargs_with_equivalents = { + **expected_chat_kwargs, + "p": 0.9, + "user_name": "admin", + } + cohere_llm._client.chat.assert_called_once_with( + **expected_chat_kwargs_with_equivalents + ) + assert response.object == "chat.completion" + + +def test_chat_completion_with_stuffing_context_snippets(cohere_llm, + messages, + expected_chat_kwargs, + system_prompt): + cohere_llm._client = MagicMock(wraps=cohere_llm._client) + content = StuffingContextContent([ + ContextQueryResult(query="", snippets=[ + ContextSnippet( + source="https://www.example.com/document", + text="Document text", + ), + ContextSnippet( + source="https://www.example.com/second_document", + text="Second document text", + ) + ]) + ]) + stuffing_context = Context( + content=content, + num_tokens=123) + + response = cohere_llm.chat_completion( + chat_history=messages, + system_prompt=system_prompt, + context=stuffing_context) + + # Check that we got a valid chat response - details tested in other tests + assert isinstance(response, ChatResponse) + assert response.object == "chat.completion" + + # Check that Cohere client was called with the snippets + expected_chat_kwargs["documents"] = [ + { + "source": "https://www.example.com/document", + "text": "Document text", + }, + { + "source": "https://www.example.com/second_document", + "text": "Second document text", + }, + ] + cohere_llm._client.chat.assert_called_once_with(**expected_chat_kwargs) + + +def 
test_token_counts_mapped_in_chat_response(cohere_llm, messages, system_prompt): + response = cohere_llm.chat_completion(chat_history=messages, + system_prompt=system_prompt) + assert response.usage.prompt_tokens == 107 + assert response.usage.completion_tokens + assert response.usage.total_tokens + + +def test_api_errors_caught_and_raised_as_runtime_errors(cohere_llm, + messages, + system_prompt): + expected_message = ( + "Failed to use Cohere's unknown_model model for chat completion." + " Underlying Error:\n" + ".+" + ) + + with pytest.raises(RuntimeError, match=expected_message): + cohere_llm.chat_completion(chat_history=messages, + system_prompt=system_prompt, + model_params={ + "model": "unknown_model", + }) + + +def test_bad_api_key(monkeypatch): + monkeypatch.setenv("CO_API_KEY", "") + + expected_message = ( + "Failed to connect to Cohere, please make sure that the CO_API_KEY" + " environment variable is set correctly.\n" + ".*API key" + ) + + with pytest.raises(RuntimeError, match=expected_message): + CohereLLM() diff --git a/tests/system/llm/test_openai.py b/tests/system/llm/test_openai.py index b0d0c21f..e6c334c8 100644 --- a/tests/system/llm/test_openai.py +++ b/tests/system/llm/test_openai.py @@ -111,7 +111,7 @@ def test_chat_completion_with_context(openai_llm, messages): chat_history=messages, context=Context( content=StringContextContent( - __root__="context from kb" + "context from kb" ), num_tokens=5 )) diff --git a/tests/system/query_generator/test_cohere_query_generator.py b/tests/system/query_generator/test_cohere_query_generator.py new file mode 100644 index 00000000..163826fd --- /dev/null +++ b/tests/system/query_generator/test_cohere_query_generator.py @@ -0,0 +1,30 @@ +import pytest + +from canopy.chat_engine.query_generator.cohere import CohereQueryGenerator +from canopy.models.data_models import MessageBase, Role + + +@pytest.fixture +def messages(): + return [ + MessageBase( + role=Role.USER, content="Hello, assistant."), + MessageBase( + role=Role.ASSISTANT, content="Hello, user. 
How can I assist you?"), + MessageBase( + role=Role.USER, content="How do I init a pinecone client?.") + ] + + +def test_generate_queries(messages): + query_generator = CohereQueryGenerator() + queries = query_generator.generate(messages, max_prompt_tokens=100) + assert queries + assert queries[0].text + + +def test_max_tokens_exceeded_raises_error(messages): + query_generator = CohereQueryGenerator() + + with pytest.raises(ValueError): + query_generator.generate(messages, max_prompt_tokens=10) diff --git a/tests/system/record_encoder/test_sentence_transformers_encoder.py b/tests/system/record_encoder/test_sentence_transformers_encoder.py new file mode 100644 index 00000000..10ed6f6c --- /dev/null +++ b/tests/system/record_encoder/test_sentence_transformers_encoder.py @@ -0,0 +1,61 @@ +import pytest + +from canopy.knowledge_base.models import KBDocChunk +from canopy.knowledge_base.record_encoder.sentence_transformers import ( + SentenceTransformerRecordEncoder +) +from canopy.models.data_models import Query + +documents = [KBDocChunk( + id=f"doc_1_{i}", + text=f"Sample document {i}", + document_id=f"doc_{i}", + metadata={"test": i}, + source="doc_1", +) + for i in range(4) +] + +queries = [Query(text="Sample query 1"), + Query(text="Sample query 2"), + Query(text="Sample query 3"), + Query(text="Sample query 4")] + + +@pytest.fixture +def encoder(): + try: + encoder = SentenceTransformerRecordEncoder(batch_size=2) + except ImportError: + pytest.skip( + "`transformers` extra not installed. Skipping SentenceTransformer system " + "tests" + ) + return encoder + + +def test_dimension(encoder): + assert encoder.dimension == 384 + + +@pytest.mark.parametrize("items,function", + [(documents, "encode_documents"), + (queries, "encode_queries"), + ([], "encode_documents"), + ([], "encode_queries")]) +def test_encode_documents(encoder, items, function): + + encoded_documents = getattr(encoder, function)(items) + + assert len(encoded_documents) == len(items) + assert all(len(encoded.values) == encoder.dimension + for encoded in encoded_documents) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("items,function", + [("aencode_documents", documents), + ("aencode_queries", queries)]) +async def test_aencode_not_implemented(encoder, function, items): + with pytest.raises(NotImplementedError): + await encoder.aencode_queries(items) diff --git a/tests/system/reranker/__init__.py b/tests/system/reranker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/system/reranker/test_cohere_reranker.py b/tests/system/reranker/test_cohere_reranker.py new file mode 100644 index 00000000..63453f8f --- /dev/null +++ b/tests/system/reranker/test_cohere_reranker.py @@ -0,0 +1,71 @@ +import os + +import pytest + +from canopy.knowledge_base.models import KBQueryResult, KBDocChunkWithScore +from canopy.knowledge_base.reranker import CohereReranker + + +@pytest.fixture +def should_run_test(): + if os.getenv("CO_API_KEY") is None: + pytest.skip( + "Couldn't find Cohere API key. Skipping Cohere tests." 
+        )
+
+
+@pytest.fixture
+def cohere_reranker(should_run_test):
+    return CohereReranker()
+
+
+@pytest.fixture
+def documents():
+    return [
+        KBDocChunkWithScore(
+            id=f"doc_1_{i}",
+            text=f"Sample chunk {i}",
+            document_id="doc_1",
+            source="doc_1",
+            score=0.1 * i
+        ) for i in range(4)
+    ]
+
+
+@pytest.fixture
+def query_result(documents):
+    return KBQueryResult(query="Sample query 1",
+                         documents=documents)
+
+
+def test_rerank_empty(cohere_reranker):
+    results = cohere_reranker.rerank([])
+    assert results == []
+
+
+def test_rerank(cohere_reranker, query_result, documents):
+    id_to_score = {d.id: d.score for d in query_result.documents}
+    ranked_result = next(iter(cohere_reranker.rerank([query_result])))
+    reranked_scores = [doc.score for doc in ranked_result.documents]
+
+    assert len(ranked_result.documents) == len(documents)
+    assert reranked_scores == sorted(reranked_scores, reverse=True)
+
+    # Make sure the scores are overridden by the reranker
+    for doc in ranked_result.documents:
+        assert doc.score != id_to_score[doc.id]
+
+
+def test_bad_api_key(should_run_test, query_result):
+    with pytest.raises(RuntimeError, match="invalid api token"):
+        CohereReranker(api_key="bad key").rerank([query_result])
+
+
+def test_model_name_invalid(should_run_test, query_result):
+    with pytest.raises(RuntimeError, match="model .* not found"):
+        CohereReranker(model_name="my-madeup-model").rerank([query_result])
+
+
+def test_top_n(should_run_test, query_result):
+    results = CohereReranker(top_n=1).rerank([query_result])
+    assert len(results[0].documents) == 1
diff --git a/tests/system/reranker/test_transparent_reranker.py b/tests/system/reranker/test_transparent_reranker.py
new file mode 100644
index 00000000..eb8c3e17
--- /dev/null
+++ b/tests/system/reranker/test_transparent_reranker.py
@@ -0,0 +1,27 @@
+import pytest
+
+from canopy.knowledge_base.models import KBDocChunkWithScore, KBQueryResult
+from canopy.knowledge_base.reranker import TransparentReranker
+
+
+@pytest.fixture
+def documents():
+    return [
+        KBDocChunkWithScore(
+            id=f"doc_1_{i}",
+            text=f"Sample chunk {i}",
+            document_id="doc_1",
+            source="doc_1",
+            score=0.1 * i
+        ) for i in range(1)
+    ]
+
+
+@pytest.fixture
+def query_result(documents):
+    return KBQueryResult(query="Sample query 1",
+                         documents=documents)
+
+
+def test_rerank(query_result):
+    assert TransparentReranker().rerank([query_result]) == [query_result]
diff --git a/tests/system/tokenizer/__init__.py b/tests/system/tokenizer/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/system/tokenizer/test_cohere_api_tokenizer.py b/tests/system/tokenizer/test_cohere_api_tokenizer.py
new file mode 100644
index 00000000..fcdb6576
--- /dev/null
+++ b/tests/system/tokenizer/test_cohere_api_tokenizer.py
@@ -0,0 +1,46 @@
+import os
+
+import pytest
+
+from canopy.models.data_models import MessageBase, Role
+from canopy.tokenizer import CohereAPITokenizer
+from ...unit.tokenizer.base_test_tokenizer import BaseTestTokenizer
+
+
+class TestCohereAPITokenizer(BaseTestTokenizer):
+    @staticmethod
+    @pytest.fixture(scope="class")
+    def tokenizer():
+        if not os.getenv("CO_API_KEY"):
+            pytest.skip("Skipping Cohere API tokenizer tests because "
+                        "CO_API_KEY environment variable is not set.")
+        return CohereAPITokenizer(model_name="command")
+
+    @staticmethod
+    @pytest.fixture
+    def text():
+        return "string with special characters like !@#$%^&*()_+日本 " \
+               "spaces \n \n\n CASE cAse "
+
+    @staticmethod
+    @pytest.fixture
+    def expected_tokens(text):
+        return
['string', ' with', ' special', ' characters', ' like', + ' !', '@', '#', '$', '%', '^', '&', '*', '()', '_', '+', '日', + '本', ' spaces', ' ', '\n ', '\n\n', ' CASE', ' c', 'A', + 'se', " "] + + @staticmethod + def test_messages_token_count(tokenizer): + messages = [MessageBase(role=Role.USER, content="Hello, assistant.")] + assert tokenizer.messages_token_count(messages) == 11 + + messages = [MessageBase(role=Role.USER, + content="Hello, assistant."), + MessageBase(role=Role.ASSISTANT, + content="Hello, user. How can I assist you?")] + assert tokenizer.messages_token_count(messages) == 25 + + @staticmethod + def test_messages_token_count_empty_messages(tokenizer): + assert tokenizer.messages_token_count([]) == 3 diff --git a/tests/system/utils/test_config.py b/tests/system/utils/test_config.py index 0701f358..d9ee8789 100644 --- a/tests/system/utils/test_config.py +++ b/tests/system/utils/test_config.py @@ -6,8 +6,7 @@ from canopy.chat_engine import ChatEngine from canopy.context_engine import ContextEngine from canopy.knowledge_base import KnowledgeBase - -DEFAULT_COFIG_PATH = 'config/config.yaml' +from canopy.utils.directory import Directory @pytest.fixture(scope='module') @@ -24,8 +23,10 @@ def temp_index_name(): def test_default_config_matches_code_defaults(temp_index_name): - with open(DEFAULT_COFIG_PATH) as f: - default_config = yaml.safe_load(f) + + with open(Directory.CONFIG_TEMPLATES.joinpath("default.yaml")) as file: + default_config = yaml.safe_load(file) + chat_engine_config = default_config['chat_engine'] loaded_chat_engine = ChatEngine.from_config(chat_engine_config) diff --git a/tests/unit/chat_engine/test_chat_engine.py b/tests/unit/chat_engine/test_chat_engine.py index 97628019..16923ee7 100644 --- a/tests/unit/chat_engine/test_chat_engine.py +++ b/tests/unit/chat_engine/test_chat_engine.py @@ -60,7 +60,7 @@ def _get_inputs_and_expected(self, mock_queries = [Query(text="How does photosynthesis work?")] mock_context = Context( content=StuffingContextContent( - __root__=[ContextQueryResult( + [ContextQueryResult( query="How does photosynthesis work?", snippets=[ContextSnippet(source="ref 1", diff --git a/tests/unit/chunker/test_markdown_chunker.py b/tests/unit/chunker/test_markdown_chunker.py index db9692b9..4025e5bc 100644 --- a/tests/unit/chunker/test_markdown_chunker.py +++ b/tests/unit/chunker/test_markdown_chunker.py @@ -127,7 +127,7 @@ def expected_chunks(documents): '\ntext in level 3\n#### Level 4\ntext in level 4\n##### Level 5' '\ntext in level 5\n###### Level 6\ntext in level 6', source='doc_1', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk( @@ -139,7 +139,7 @@ def expected_chunks(documents): '~~Strikethrough text~~\n\n' '## Another second level header\ntext after second level header', source='doc_1', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk( @@ -156,13 +156,13 @@ def expected_chunks(documents): '\n\n## Blockquotes\n\n' '> This is a blockquote.', source='doc_1', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_3', text='## long text', source='doc_1', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_4', @@ -176,7 +176,7 @@ def expected_chunks(documents): 'Inside, not gold, But memories and ' 'tales. 
Of', source='doc_1', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_5', @@ -185,7 +185,7 @@ def expected_chunks(documents): 'Of brave ancestors, And ' 'magical whales.', source='doc_1', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_6', @@ -195,7 +195,7 @@ def expected_chunks(documents): "\nThe village united, " "Bathed in tales' light.", source='doc_1', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_7', @@ -206,13 +206,13 @@ def expected_chunks(documents): "\n```\n## table" "\na | b | c\n--- | --- | ---\n1 | 2 | 3", source='doc_1', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_3_0', text='# short markdown\nmarkdown is short', source='', - metadata={'test': '2'}, + metadata={'test': 2}, document_id='test_document_3') ] return chunks diff --git a/tests/unit/chunker/test_recursive_character_chunker.py b/tests/unit/chunker/test_recursive_character_chunker.py index e3fdbd22..70f195e1 100644 --- a/tests/unit/chunker/test_recursive_character_chunker.py +++ b/tests/unit/chunker/test_recursive_character_chunker.py @@ -19,49 +19,49 @@ def expected_chunks(documents): return [ KBDocChunk(id='test_document_1_0', text='I am a', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_1', text='a simple test', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_2', text='test string to', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_3', text='to check the', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_4', text='the happy path', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_5', text='path of this', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_6', text='this simple chunker', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_2_0', text='another simple test', - metadata={'test': '2'}, + metadata={'test': 2}, document_id='test_document_2', source='doc_2'), KBDocChunk(id='test_document_2_1', text='test string', - metadata={'test': '2'}, + metadata={'test': 2}, document_id='test_document_2', source='doc_2'), KBDocChunk(id='test_document_3_0', text='sho', - metadata={'test': '2'}, + metadata={'test': 2}, document_id='test_document_3', source='doc_3'), KBDocChunk(id='test_document_3_1', text='ort', - metadata={'test': '2'}, + metadata={'test': 2}, document_id='test_document_3', source='doc_3')] diff --git a/tests/unit/chunker/test_token_chunker.py b/tests/unit/chunker/test_token_chunker.py index b5550e3c..b6a1d739 100644 --- a/tests/unit/chunker/test_token_chunker.py +++ b/tests/unit/chunker/test_token_chunker.py @@ -19,32 +19,32 @@ def chunker(): def expected_chunks(documents): return [KBDocChunk(id='test_document_1_0', text='I am a simple test', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_1', text='simple test string to check', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_2', 
text='to check the happy path', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_3', text='happy path of this simple', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_4', text='this simple chunker', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1',), KBDocChunk(id='test_document_2_0', text='another simple test string', - metadata={'test': '2'}, + metadata={'test': 2}, document_id='test_document_2', source='doc_2'), KBDocChunk(id='test_document_3_0', text='short', - metadata={'test': '2'}, + metadata={'test': 2}, document_id='test_document_3', source='doc_3'), ] @@ -59,11 +59,11 @@ def test_chunk_single_document_zero_overlap(chunker): expected = [KBDocChunk(id='test_document_1_0', text='I am a test string', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1'), KBDocChunk(id='test_document_1_1', text='with no overlap', - metadata={'test': '1'}, + metadata={'test': 1}, document_id='test_document_1')] for actual_chunk, expected_chunk in zip(actual, expected): diff --git a/tests/unit/context_builder/test_stuffing_context_builder.py b/tests/unit/context_builder/test_stuffing_context_builder.py index 4881926b..af9c4d74 100644 --- a/tests/unit/context_builder/test_stuffing_context_builder.py +++ b/tests/unit/context_builder/test_stuffing_context_builder.py @@ -50,7 +50,7 @@ def setup_method(self): ]) ] self.full_context = Context( - content=StuffingContextContent(__root__=[ + content=StuffingContextContent([ ContextQueryResult(query="test query 1", snippets=[ ContextSnippet( @@ -80,7 +80,7 @@ def test_context_fits_within_max_tokens(self): def test_context_exceeds_max_tokens(self): context = self.builder.build(self.query_results, max_context_tokens=30) - expected_context = Context(content=StuffingContextContent(__root__=[ + expected_context = Context(content=StuffingContextContent([ ContextQueryResult(query="test query 1", snippets=[ ContextSnippet( @@ -102,7 +102,7 @@ def test_context_exceeds_max_tokens_unordered(self): self.query_results[0].documents[0].text = self.text1 * 100 context = self.builder.build(self.query_results, max_context_tokens=20) - expected_context = Context(content=StuffingContextContent(__root__=[ + expected_context = Context(content=StuffingContextContent([ ContextQueryResult(query="test query 2", snippets=[ ContextSnippet( @@ -118,17 +118,17 @@ def test_context_exceeds_max_tokens_unordered(self): def test_whole_query_results_not_fit(self): context = self.builder.build(self.query_results, max_context_tokens=10) assert context.num_tokens == 1 - assert context.content == [] + assert context.content.model_dump() == [] def test_max_tokens_zero(self): context = self.builder.build(self.query_results, max_context_tokens=0) self.assert_num_tokens(context, 1) - assert context.content == [] + assert context.content.model_dump() == [] def test_empty_query_results(self): context = self.builder.build([], max_context_tokens=100) self.assert_num_tokens(context, 1) - assert context.content == [] + assert context.content.model_dump() == [] def test_documents_with_duplicates(self): duplicate_query_results = self.query_results + [ @@ -173,7 +173,7 @@ def test_empty_documents(self): context = self.builder.build( empty_query_results, max_context_tokens=100) self.assert_num_tokens(context, 1) - assert context.content == [] + assert context.content.model_dump() == [] def assert_num_tokens(self, context: Context, 
max_tokens: int): assert context.num_tokens <= max_tokens diff --git a/tests/unit/context_engine/test_context_engine.py b/tests/unit/context_engine/test_context_engine.py index 1ed2b52b..6aad6a75 100644 --- a/tests/unit/context_engine/test_context_engine.py +++ b/tests/unit/context_engine/test_context_engine.py @@ -186,11 +186,12 @@ def test_context_query_result_to_text(): query_result = ContextQueryResult(query="How does photosynthesis work?", snippets=[ContextSnippet(text="42", source="ref")]) - context = Context(content=StuffingContextContent(__root__=[query_result]), + context = Context(content=StuffingContextContent([query_result]), num_tokens=1) - assert context.to_text() == json.dumps([query_result.dict()]) - assert context.to_text(indent=2) == json.dumps([query_result.dict()], indent=2) + assert context.to_text() == json.dumps([query_result.model_dump()]) + assert (context.to_text(indent=2) == + json.dumps([query_result.model_dump()], indent=2)) @pytest.mark.asyncio diff --git a/tests/unit/history_pruner/test_raising_history_pruner.py b/tests/unit/history_pruner/test_raising_history_pruner.py index 03a3956c..42412db3 100644 --- a/tests/unit/history_pruner/test_raising_history_pruner.py +++ b/tests/unit/history_pruner/test_raising_history_pruner.py @@ -7,7 +7,9 @@ SAMPLE_CONTEXT = Context(content=StringContextContent( - __root__="Some context information"), num_tokens=3 + "Some context information" +), + num_tokens=3 ) SYSTEM_PROMPT = "This is a system prompt." diff --git a/tests/unit/history_pruner/test_recent_history_pruner.py b/tests/unit/history_pruner/test_recent_history_pruner.py index 7cf73a7e..86bb8f5e 100644 --- a/tests/unit/history_pruner/test_recent_history_pruner.py +++ b/tests/unit/history_pruner/test_recent_history_pruner.py @@ -7,8 +7,8 @@ SAMPLE_CONTEXT = Context(content=StringContextContent( - __root__="Some context information"), num_tokens=3 -) + "Some context information" +), num_tokens=3) SYSTEM_PROMPT = "This is a system prompt." 
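The recurring edits in these tests, dropping __root__= from StringContextContent / StuffingContextContent and replacing .dict() with .model_dump(), follow Pydantic's v1 to v2 API: custom-root models take their value positionally and serialization is renamed to model_dump(). A minimal sketch of that pattern, assuming Pydantic v2's RootModel; the real StringContextContent in canopy.models.data_models may be defined differently:

    from pydantic import RootModel

    class StringContextContent(RootModel[str]):
        pass

    # Pydantic v1 wrote StringContextContent(__root__="context from kb") and content.dict();
    # v2 passes the root value positionally and serializes with model_dump().
    content = StringContextContent("context from kb")
    assert content.model_dump() == "context from kb"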
diff --git a/tests/unit/record_encoder/base_test_record_encoder.py b/tests/unit/record_encoder/base_test_record_encoder.py
index 627d12ca..16658e50 100644
--- a/tests/unit/record_encoder/base_test_record_encoder.py
+++ b/tests/unit/record_encoder/base_test_record_encoder.py
@@ -47,14 +47,14 @@ def queries():
     @pytest.fixture
     def expected_encoded_documents(documents, inner_encoder):
         values = inner_encoder.encode_documents([d.text for d in documents])
-        return [KBEncodedDocChunk(**d.dict(), values=v) for d, v in
+        return [KBEncodedDocChunk(**d.model_dump(), values=v) for d, v in
                 zip(documents, values)]
 
     @staticmethod
     @pytest.fixture
     def expected_encoded_queries(queries, inner_encoder):
         values = inner_encoder.encode_queries([q.text for q in queries])
-        return [KBQuery(**q.dict(), values=v) for q, v in zip(queries, values)]
+        return [KBQuery(**q.model_dump(), values=v) for q, v in zip(queries, values)]
 
     @staticmethod
     def test_dimension(record_encoder, expected_dimension):
diff --git a/tests/unit/record_encoder/test_hybrid_record_encoder.py b/tests/unit/record_encoder/test_hybrid_record_encoder.py
new file mode 100644
index 00000000..287f47b6
--- /dev/null
+++ b/tests/unit/record_encoder/test_hybrid_record_encoder.py
@@ -0,0 +1,134 @@
+import tempfile
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+from pinecone_text.sparse import BM25Encoder
+
+from canopy.knowledge_base.models import KBDocChunk
+from canopy.knowledge_base.record_encoder import HybridRecordEncoder, DenseRecordEncoder
+from canopy.models.data_models import Query
+from tests.unit.stubs.stub_dense_encoder import StubDenseEncoder
+
+
+@pytest.fixture(scope="module")
+def documents():
+    return [KBDocChunk(id=f"doc_1_{i}",
+                       text=f"Sample document {i}",
+                       document_id=f"doc_{i}",
+                       metadata={"test": i},
+                       source="doc_1")
+            for i in range(5)]
+
+
+@pytest.fixture(scope="module")
+def queries():
+    return [Query(text="Sample query 1"),
+            Query(text="Sample query 2"),
+            Query(text="Sample query 3")]
+
+
+@pytest.fixture(scope="module")
+def inner_dimension():
+    return 4
+
+
+@pytest.fixture(scope="module")
+def dense_record_encoder(inner_dimension):
+    return DenseRecordEncoder(StubDenseEncoder(dimension=inner_dimension))
+
+
+@pytest.fixture(scope="module")
+def bm_25_encoder_df_path(documents):
+    bm25 = BM25Encoder()
+    bm25.fit([doc.text for doc in documents])
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_path = str(Path(tmp_dir, "bm25_params.json"))
+        bm25.dump(tmp_path)
+        yield tmp_path
+
+
+@pytest.fixture(scope="module")
+def hybrid_encoder(dense_record_encoder, bm_25_encoder_df_path):
+    return HybridRecordEncoder(dense_record_encoder,
+                               bm_25_encoder_df_path=bm_25_encoder_df_path,
+                               batch_size=2)
+
+
+def test_dimension(hybrid_encoder, inner_dimension):
+    assert hybrid_encoder.dimension == inner_dimension
+
+
+def test_init_encoder_invalid_alpha(dense_record_encoder):
+    with pytest.raises(ValueError):
+        HybridRecordEncoder(dense_record_encoder, alpha=-1)
+    with pytest.raises(ValueError):
+        HybridRecordEncoder(dense_record_encoder, alpha=2)
+    with pytest.raises(ValueError, match="sparse only"):
+        HybridRecordEncoder(dense_record_encoder, alpha=0)
+
+
+def test_encode_documents(hybrid_encoder, documents, queries):
+    encoded_documents = hybrid_encoder.encode_documents(documents)
+    for encoded_document in encoded_documents:
+        assert len(encoded_document.values) == hybrid_encoder.dimension
+        assert "indices" in encoded_document.sparse_values
+        assert "values" in encoded_document.sparse_values
+
+
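The alpha test further below (test_encode_queries_alpha_applied_correctly) only holds if hybrid queries are scaled convexly: dense values multiplied by alpha, sparse values by (1 - alpha). The sketch below spells out that arithmetic; it is an assumption inferred from the test's assertions, not canopy's actual HybridRecordEncoder implementation:

    def convex_scale(dense_values, sparse_values, alpha):
        # Dense values grow linearly with alpha and sparse values with (1 - alpha),
        # so doubling alpha scales the dense part by 2 and the sparse part by
        # (1 - 2 * alpha) / (1 - alpha), the scaling_coefficient asserted in the test.
        return ([alpha * v for v in dense_values],
                {"indices": sparse_values["indices"],
                 "values": [(1 - alpha) * v for v in sparse_values["values"]]})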
+def test_encode_queries(hybrid_encoder, queries): + encoded_queries = hybrid_encoder.encode_queries(queries) + assert len(encoded_queries) == len(queries) + for encoded_query in encoded_queries: + assert len(encoded_query.values) == hybrid_encoder.dimension + assert "indices" in encoded_query.sparse_values + assert "values" in encoded_query.sparse_values + + +def test_encode_queries_alpha_applied_correctly(dense_record_encoder, + bm_25_encoder_df_path, + queries): + """ + Tests whether the alpha value is applied correctly when encoding queries. + """ + alpha = 0.2 + alpha_coefficient = 2 + + hb_1 = HybridRecordEncoder(dense_record_encoder, + bm_25_encoder_df_path=bm_25_encoder_df_path, + alpha=alpha) + hb_2 = HybridRecordEncoder(dense_record_encoder, + bm_25_encoder_df_path=bm_25_encoder_df_path, + alpha=alpha_coefficient * alpha) + + encoded_queries = hb_1.encode_queries(queries) + encoded_queries_2 = hb_2.encode_queries(queries) + + for encoded_query, encoded_query_2 in zip(encoded_queries, encoded_queries_2): + assert len(encoded_query.values) == len(encoded_query_2.values) + for value, value_2 in zip(encoded_query.values, encoded_query_2.values): + assert pytest.approx(value * alpha_coefficient) == value_2 + + assert (encoded_query.sparse_values["indices"] == + encoded_query_2.sparse_values["indices"]) + + scaling_coefficient = (1 - alpha_coefficient * alpha) / (1 - alpha) + for value, value_2 in zip(encoded_query.sparse_values["values"], + encoded_query_2.sparse_values["values"]): + + assert pytest.approx(value * scaling_coefficient) == value_2 + + +def test_encode_queries_with_alpha_1(hybrid_encoder, dense_record_encoder, queries): + """ + Tests whether the encoded queries are exactly the same as the dense + encoded queries when alpha is 1. + """ + with patch.object(hybrid_encoder, '_alpha', new=1.0): + encoded_queries = hybrid_encoder.encode_queries(queries) + dense_queries = dense_record_encoder.encode_queries(queries) + + assert len(encoded_queries) == len(dense_queries) + for encoded_query, dense_query in zip(encoded_queries, dense_queries): + assert encoded_query.values == dense_query.values diff --git a/tests/unit/record_encoder/test_jina_record_encoder.py b/tests/unit/record_encoder/test_jina_record_encoder.py index 9798fb38..203b87eb 100644 --- a/tests/unit/record_encoder/test_jina_record_encoder.py +++ b/tests/unit/record_encoder/test_jina_record_encoder.py @@ -30,13 +30,16 @@ def encoder(): def test_dimension(encoder): with patch('pinecone_text.dense.JinaEncoder.encode_documents') \ as mock_encode_documents: - mock_encode_documents.return_value = [[0.1, 0.2, 0.3]] + mock_encode_documents.return_value = [0.1, 0.2, 0.3] assert encoder.dimension == 3 def custom_encode(*args, **kwargs): input_to_encode = args[0] - return [[0.1, 0.2, 0.3] for _ in input_to_encode] + if isinstance(input_to_encode, list): + return [[0.1, 0.2, 0.3] for _ in input_to_encode] + else: + return [0.1, 0.2, 0.3] @pytest.mark.parametrize("items,function", diff --git a/tests/unit/record_encoder/test_sentence_transformers_encoder.py b/tests/unit/record_encoder/test_sentence_transformers_encoder.py new file mode 100644 index 00000000..de57eb90 --- /dev/null +++ b/tests/unit/record_encoder/test_sentence_transformers_encoder.py @@ -0,0 +1,77 @@ +import pytest + +from canopy.knowledge_base.models import KBDocChunk +from canopy.knowledge_base.record_encoder.sentence_transformers import ( + SentenceTransformerRecordEncoder +) +from canopy.models.data_models import Query + +from unittest.mock import patch + 
+documents = [KBDocChunk( + id=f"doc_1_{i}", + text=f"Sample document {i}", + document_id=f"doc_{i}", + metadata={"test": i}, + source="doc_1", +) + for i in range(4) +] + +queries = [Query(text="Sample query 1"), + Query(text="Sample query 2"), + Query(text="Sample query 3"), + Query(text="Sample query 4")] + + +@pytest.fixture +def encoder(): + try: + encoder = SentenceTransformerRecordEncoder(batch_size=2) + except ImportError: + pytest.skip( + "`transformers` extra not installed. Skipping SentenceTransformer unit " + "tests" + ) + return encoder + + +def test_dimension(encoder): + with patch('pinecone_text.dense.SentenceTransformerEncoder.encode_documents') \ + as mock_encode_documents: + mock_encode_documents.return_value = [0.1, 0.2, 0.3] + assert encoder.dimension == 3 + + +def custom_encode(*args, **kwargs): + input_to_encode = args[0] + if isinstance(input_to_encode, list): + return [[0.1, 0.2, 0.3] for _ in input_to_encode] + else: + return [0.1, 0.2, 0.3] + + +@pytest.mark.parametrize("items,function", + [(documents, "encode_documents"), + (queries, "encode_queries"), + ([], "encode_documents"), + ([], "encode_queries")]) +def test_encode_documents(encoder, items, function): + with patch('pinecone_text.dense.SentenceTransformerEncoder.encode_documents', + side_effect=custom_encode): + with patch('pinecone_text.dense.SentenceTransformerEncoder.encode_queries', + side_effect=custom_encode): + encoded_documents = getattr(encoder, function)(items) + + assert len(encoded_documents) == len(items) + assert all(len(encoded.values) == encoder.dimension + for encoded in encoded_documents) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("items,function", + [("aencode_documents", documents), + ("aencode_queries", queries)]) +async def test_aencode_not_implemented(encoder, function, items): + with pytest.raises(NotImplementedError): + await encoder.aencode_queries(items) diff --git a/tests/unit/stubs/stub_chunker.py b/tests/unit/stubs/stub_chunker.py index 7f6058ad..68343660 100644 --- a/tests/unit/stubs/stub_chunker.py +++ b/tests/unit/stubs/stub_chunker.py @@ -15,7 +15,7 @@ def chunk_single_document(self, document: Document) -> List[KBDocChunk]: return [] # simply duplicate docs as chunks - return [KBDocChunk(id=f"{document.id}_{i}", + return [KBDocChunk(id=self.generate_chunk_id(document.id, i), document_id=document.id, text=document.text + (f" dup_{i}" if i > 0 else ""), source=document.source, diff --git a/tests/unit/stubs/stub_record_encoder.py b/tests/unit/stubs/stub_record_encoder.py index 2b77f170..7222d0df 100644 --- a/tests/unit/stubs/stub_record_encoder.py +++ b/tests/unit/stubs/stub_record_encoder.py @@ -22,7 +22,7 @@ def _encode_documents_batch(self, values = self._dense_encoder.encode_documents(doc.text) result.append( KBEncodedDocChunk( - **doc.dict(), + **doc.model_dump(), values=values)) return result @@ -33,7 +33,7 @@ def _encode_queries_batch(self, for query in queries: values = self._dense_encoder.encode_queries(query.text) result.append( - KBQuery(**query.dict(), + KBQuery(**query.model_dump(), values=values)) return result diff --git a/tests/unit/tokenizer/test_cohere_hf_tokenizer.py b/tests/unit/tokenizer/test_cohere_hf_tokenizer.py new file mode 100644 index 00000000..f5f9671d --- /dev/null +++ b/tests/unit/tokenizer/test_cohere_hf_tokenizer.py @@ -0,0 +1,66 @@ +import pytest +from canopy.tokenizer import CohereHFTokenizer +from canopy.models.data_models import MessageBase, Role +from .base_test_tokenizer import BaseTestTokenizer + + +class 
TestCohereHFTokenizer(BaseTestTokenizer): + @staticmethod + @pytest.fixture(scope="class") + def tokenizer(): + try: + tokenizer = CohereHFTokenizer() + except ImportError: + pytest.skip( + "`cohere` extra not installed. Skipping CohereHFTokenizer unit " + "tests" + ) + return tokenizer + + @staticmethod + @pytest.fixture + def expected_tokens(text): + return [ + 'string', + 'Ġwith', + 'Ġspecial', + 'Ġcharacters', + 'Ġlike', + 'Ġ!', + '@', + '#', + '$', + '%', + '^', + '&', + '*', + '()', + '_', + '+', + 'Ġæ', + 'Ĺ', + '¥', + 'æľ¬', + 'Ġspaces', + 'ĠĠĠ', + 'ĊĠ', + 'ĊĊ', + 'ĠCASE', + 'Ġc', + 'A', + 'se', + 'Ġ', + ] + + @staticmethod + def test_messages_token_count(tokenizer): + messages = [MessageBase(role=Role.USER, content="Hello, assistant.")] + assert tokenizer.messages_token_count(messages) == 11 + + messages = [ + MessageBase(role=Role.USER, content="Hello, assistant."), + MessageBase( + role=Role.ASSISTANT, content="Hello, user. How can I assist you?" + ), + ] + assert tokenizer.messages_token_count(messages) == 25 diff --git a/tests/unit/tokenizer/test_llama_tokenizer.py b/tests/unit/tokenizer/test_llama_tokenizer.py index cc0a838f..efd5e7d1 100644 --- a/tests/unit/tokenizer/test_llama_tokenizer.py +++ b/tests/unit/tokenizer/test_llama_tokenizer.py @@ -8,7 +8,14 @@ class TestLlamaTokenizer(BaseTestTokenizer): @staticmethod @pytest.fixture(scope="class") def tokenizer(): - return LlamaTokenizer(model_name="hf-internal-testing/llama-tokenizer") + try: + tokenizer = LlamaTokenizer(model_name="hf-internal-testing/llama-tokenizer") + except ImportError: + pytest.skip( + "`transformers` extra not installed. Skipping LLamaTokenizer unit " + "tests" + ) + return tokenizer @staticmethod @pytest.fixture