From 9842d55bbfa03990e220519dc7cbbf1949875779 Mon Sep 17 00:00:00 2001 From: Lutchy Horace Date: Wed, 16 Apr 2025 16:57:39 -0400 Subject: [PATCH] Initial code commit --- README.md | 91 +++++++++------ config.ini | 19 ++++ config.ini.example | 18 +++ config.yaml.example | 32 ++++++ lancedb-context.service | 15 +++ lancedb_context_provider.py | 81 ++++++++++++++ lancedb_ingest.py | 217 ++++++++++++++++++++++++++++++++++++ 7 files changed, 441 insertions(+), 32 deletions(-) create mode 100644 config.ini create mode 100644 config.ini.example create mode 100644 config.yaml.example create mode 100644 lancedb-context.service create mode 100755 lancedb_context_provider.py create mode 100644 lancedb_ingest.py diff --git a/README.md b/README.md index da21137..5a7a24d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ -Here’s a polished `README.md` for your project **CodeRecall**: --- @@ -26,11 +25,11 @@ No cloud APIs. No latency. Full control. ``` CodeRecall/ -├── chroma_ingest.py # Ingest codebase + Git into ChromaDB -├── chroma_context_provider.py # VS Code Continue context provider -├── config.ini # Ollama + Chroma settings -├── chroma-db/ # ChromaDB persistence directory -└── config.json # Continue extension config +├── lancedb_ingest.py # Ingest codebase + Git into ChromaDB +├── lancedb_context_provider.py # VS Code Continue context provider +├── config.ini.example # Ollama + LanceDB settings +├── lancedb-data/ # LanceDB persistence directory +└── config.json # Continue extension config ``` --- @@ -40,7 +39,7 @@ CodeRecall/ ### 1. Install dependencies ```bash -pip install chromadb requests +pip install lancedb ``` Make sure you have: @@ -51,11 +50,24 @@ Make sure you have: ### 2. 
Configure `config.ini` ```ini -[ollama] +[[ollama] url = http://localhost:11434 -[chroma] -persist_directory = ./chroma-db +[lancedb] +persist_directory = ./lancedb-data + +[s3] +enable = True +bucket_name = my-s3-bucket +access_key_id = my-access-key +secret_access_key = my-secret-key +region = us-east-1 +# Optional, if using third party s3 providers +endpoint = http://minio:9000 + +[server] +host = 0.0.0.0 +port = 8080 ``` --- @@ -63,7 +75,7 @@ persist_directory = ./chroma-db ## 📥 Ingest your project ```bash -python chroma_ingest.py +python lancedb_ingest.py ``` This loads: @@ -75,26 +87,41 @@ This loads: ## 🧠 Add as a VS Code Context Provider -### `config.json` for Continue +### `config.yaml` for Continue -```json -{ - "models": [ - { - "title": "LLaMA 3 (Ollama)", - "provider": "ollama", - "model": "llama3", - "apiBase": "http://localhost:11434" - } - ], - "contextProviders": [ - { - "title": "ChromaDB Search", - "provider": "custom", - "path": "./chroma_context_provider.py" - } - ] -} +```yaml +name: Local Assistant +version: 1.0.0 +schema: v1 +models: + - name: Ollama Autodetect + provider: ollama + model: AUTODETECT + apiBase: http://localhost:11434 + - name: Ollama Autocomplete + provider: ollama + model: qwen2.5-coder:1.5b-base + apiBase: http://localhost:11434 + roles: + - autocomplete + - name: Nomic Embed Text + provider: ollama + model: nomic-embed-text + apiBase: http://localhost:11434 + roles: + - embed +context: + - provider: code + - provider: docs + - provider: diff + - provider: terminal + - provider: problems + - provider: folder + - provider: codebase + # LanceDB Context Provider + - provider: http + params: + url: http://localhost/retrieve ``` --- @@ -103,7 +130,7 @@ This loads: 1. Launch VS Code. 2. Open the Continue sidebar. -3. Set `"ChromaDB Search"` as your context provider. +3. Set `"@HTTP"` as your context provider. 4. Ask your model questions about your codebase, architecture, or commits. 
Example prompt: @@ -114,7 +141,7 @@ Example prompt: ## 📌 Notes - Default embedding model is `nomic-embed-text` (via Ollama). -- Change `n_results` in `chroma_context_provider.py` for broader/narrower context. +- Change `n_results` in `lancedb_context_provider.py` for broader/narrower context. - Works offline, no API keys required. --- diff --git a/config.ini b/config.ini new file mode 100644 index 0000000..0b562a2 --- /dev/null +++ b/config.ini @@ -0,0 +1,19 @@ +[ollama] +url = https://ollama-dev-lxc.int.lhprojects.net + +[server] +host = 127.0.0.1 +port = 8080 + +[lancedb] +persist_directory = ./lancedb + +[s3] +bucket_name = lancedb +access_key_id = qW8S1xub7zhxHtItT9WG +secret_access_key = HEqQuNJXn7uR17h8T4QvmUbwdswe9PeKJy7f5wIp +region = us-east-1 +enable = True +# Optional, only set if s3_type is 'other' +endpoint = https://s3.minidrive.cloud + diff --git a/config.ini.example b/config.ini.example new file mode 100644 index 0000000..68ad866 --- /dev/null +++ b/config.ini.example @@ -0,0 +1,18 @@ +[ollama] +url = http://localhost:11434 + +[lancedb] +persist_directory = ./lancedb-data + +[s3] +enable = True +bucket_name = my-s3-bucket +access_key_id = my-access-key +secret_access_key = my-secret-key +region = us-east-1 +# Optional, if using third party s3 providers +endpoint = http://minio:9000 + +[server] +host = 0.0.0.0 +port = 8080 \ No newline at end of file diff --git a/config.yaml.example b/config.yaml.example new file mode 100644 index 0000000..4111c19 --- /dev/null +++ b/config.yaml.example @@ -0,0 +1,32 @@ +name: Local Assistant +version: 1.0.0 +schema: v1 +models: + - name: Ollama Autodetect + provider: ollama + model: AUTODETECT + apiBase: http://localhost:11434 + - name: Ollama Autocomplete + provider: ollama + model: qwen2.5-coder:1.5b-base + apiBase: http://localhost:11434 + roles: + - autocomplete + - name: Nomic Embed Text + provider: ollama + model: nomic-embed-text + apiBase: http://localhost:11434 + roles: + - embed +context: + - provider: 
#!/usr/bin/env python3
"""HTTP context provider for the Continue VS Code extension.

Exposes a single POST /retrieve endpoint that embeds the incoming query
with Ollama (nomic-embed-text) and returns the three nearest documents
from the "vectordb" LanceDB table populated by lancedb_ingest.py.
"""
from fastapi import FastAPI, Depends
from pydantic import BaseModel
import lancedb
from lancedb.embeddings.ollama import OllamaEmbeddings
import configparser
import argparse
import os
import uvicorn

app = FastAPI()


def load_config(args):
    """Read the INI configuration file named by --config."""
    config = configparser.ConfigParser()
    config.read(args.config)
    return config


def setup_database(config):
    """Connect to LanceDB, using S3 storage when [s3] enable is truthy."""
    s3_bucket = config.get("s3", "bucket_name", fallback=None)
    enable_s3 = config.get("s3", "enable", fallback="false").lower() in ['true', 'yes', '1', 'on']
    persist_directory = config.get("lancedb", "persist_directory", fallback="./lancedb")

    if s3_bucket and enable_s3:
        storage_options = {
            "aws_access_key_id": config.get("s3", "access_key_id", fallback=None),
            "aws_secret_access_key": config.get("s3", "secret_access_key", fallback=None),
            "region": config.get("s3", "region", fallback="us-east-1"),
            # endpoint is only needed for third-party S3 providers (e.g. MinIO)
            "endpoint_url": config.get("s3", "endpoint", fallback=None),
        }
        db = lancedb.connect(
            f"s3://{s3_bucket}",
            storage_options=storage_options
        )
    else:
        db = lancedb.connect(persist_directory)

    return db


class ContextProviderInput(BaseModel):
    # Payload shape sent by Continue's "http" context provider.
    query: str
    fullInput: str


@app.post("/retrieve")
async def retrieve_context(input: ContextProviderInput, embedding_function: OllamaEmbeddings = Depends(lambda: ollama_ef)):
    """Return up to three context items most similar to the query."""
    # Embed the query text with the same model used at ingest time
    query_embedding = embedding_function.generate_embeddings([input.query])[0]

    # Cosine similarity search against the ingested document vectors
    results = table.search(query_embedding).distance_type("cosine").limit(3).to_list()

    # Shape the rows into Continue context items
    context_items = []
    for result in results:
        context_items.append({
            "name": result.get("id", "unknown"),
            "description": result.get("description", "document"),
            "content": result["text"]
        })
    return context_items


if __name__ == "__main__":
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="config.ini", help="Path to config file")
    args, _ = parser.parse_known_args()

    # Load configuration and create database connection
    config = load_config(args)
    db = setup_database(config)

    # Open the table created by lancedb_ingest.py
    table = db.open_table("vectordb")

    # BUGFIX: OllamaEmbeddings takes `name` (model) and `host` keyword
    # arguments — `model=`/`url=` is wrong and inconsistent with
    # lancedb_ingest.py's setup_embedding_model().
    ollama_url = config.get("ollama", "url")
    ollama_ef = OllamaEmbeddings(name="nomic-embed-text", host=ollama_url)

    # Run the application
    host = config.get("server", "host")
    port = int(config.get("server", "port"))
    uvicorn.run(app, host=host, port=port)
#!/usr/bin/env python3
"""Ingest a codebase plus its Git history into a LanceDB vector table.

Walks a directory tree, embeds each text file (and each non-merge Git
commit) with Ollama's nomic-embed-text model, then upserts everything
into the "vectordb" table served by lancedb_context_provider.py.
"""
import argparse
import subprocess
import os
import sys
import pyarrow as pa
import time
import hashlib
import signal
import configparser
import lancedb
import mimetypes
import fnmatch
from lancedb.embeddings.ollama import OllamaEmbeddings

# Silence noisy Rust-side logging from the lancedb native layer
os.environ['RUST_LOG'] = 'error'


def handle_signals(signame):
    """Build a signal handler that announces *signame* and exits cleanly."""
    def signal_handler(sig, frame):
        print(f"\nReceived {signame} signal. Exiting...")
        # sys.exit instead of quit(): quit() comes from the optional `site`
        # module and is absent under `python -S` or in frozen builds.
        sys.exit(0)
    return signal_handler


# Register signal handlers
signal.signal(signal.SIGINT, handle_signals("SIGINT"))
signal.signal(signal.SIGTERM, handle_signals("SIGTERM"))


def load_config(args):
    """Read the INI configuration file named by --config."""
    config = configparser.ConfigParser()
    config.read(args.config)
    return config


def setup_database(config):
    """Connect to LanceDB, using S3 storage when [s3] enable is truthy."""
    s3_bucket = config.get("s3", "bucket_name", fallback=None)
    enable_s3 = config.get("s3", "enable", fallback="false").lower() in ['true', 'yes', '1', 'on']
    persist_directory = config.get("lancedb", "persist_directory", fallback="./lancedb")

    if s3_bucket and enable_s3:
        storage_options = {
            "aws_access_key_id": config.get("s3", "access_key_id", fallback=None),
            "aws_secret_access_key": config.get("s3", "secret_access_key", fallback=None),
            "region": config.get("s3", "region", fallback="us-east-1"),
            # endpoint is only needed for third-party S3 providers (e.g. MinIO)
            "endpoint_url": config.get("s3", "endpoint", fallback=None),
        }
        db = lancedb.connect(
            f"s3://{s3_bucket}",
            storage_options=storage_options
        )
    else:
        db = lancedb.connect(persist_directory)

    return db


def setup_embedding_model(config):
    """Create the Ollama embedding function used for all documents."""
    ollama_url = config.get("ollama", "url", fallback="http://localhost:11434")
    embedding_model = OllamaEmbeddings(
        host=ollama_url,
        name='nomic-embed-text',
        options=None,
        keep_alive=None,
        ollama_client_kwargs={
            # NOTE(review): disables TLS certificate verification — confirm
            # this is intentional for the target Ollama deployment.
            'verify': False
        }
    )
    return embedding_model


def create_table(db):
    """Open the "vectordb" table, creating it with the schema when missing."""
    table_name = "vectordb"
    # 768-dim vectors match nomic-embed-text's output size
    schema = pa.schema([
        pa.field("text", pa.string()),
        pa.field("id", pa.string()),
        pa.field("description", pa.string()),
        pa.field("vector", pa.list_(pa.float64(), 768))
    ])

    try:
        table = db.open_table(table_name)
    except ValueError as e:
        if "Table '" in str(e) and "' was not found" in str(e):
            print(f"Table '{table_name}' not found. Creating...")
            # The original re-wrapped an existing pa.schema in pa.schema();
            # the redundant conversion is removed.
            table = db.create_table(table_name, schema=schema, mode="overwrite")
        else:
            sys.exit(f"An error occurred when opening table: {e}")
    return table


def is_git_directory(path="."):
    """Return True when *path* is inside a Git working tree."""
    return subprocess.call(
        ['git', 'rev-parse', '--is-inside-work-tree'],
        cwd=path, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0


def load_documents(root=None, exclude=None):
    """Collect text documents under *root*, skipping *exclude* patterns.

    *exclude* is a '|'-separated list of glob patterns matched against each
    file's full path. Returns a list of {text, id, description} dicts.
    """
    documents = []

    # Split exclude into patterns if provided
    exclude_patterns = exclude.split("|") if exclude else []

    # Default to the current working directory
    if root is None:
        root = os.getcwd()

    for dirpath, dirs, files in os.walk(root):

        # Skip files matching any exclude pattern (full-path match).
        # Use the fnmatch module directly rather than reaching through
        # glob's private re-export (glob.fnmatch).
        files = [f for f in files
                 if not any(fnmatch.fnmatch(f"{dirpath}/{f}", pattern)
                            for pattern in exclude_patterns)]

        for file in files:
            path = os.path.join(dirpath, file)
            try:
                with open(path, "rb") as f:
                    content_bytes = f.read()
                content_type, _ = mimetypes.guess_type(path)

                # Decode permissively; binary junk is filtered below
                content_str = content_bytes.decode("utf-8", errors='ignore')

                # Explicitly treat application/json as text/plain
                if 'application/json' == content_type:
                    content_type = "text/plain"

                # Heuristic fallback when the MIME type is unknown or
                # non-text: keep the file only if the decoded content
                # contains whitespace, which binary blobs rarely do.
                if content_type is None or 'text' not in content_type:
                    if not any(char in content_str for char in "\n\r\t\v\f "):
                        continue

                description = ""
                if is_git_directory(dirpath):
                    try:
                        # Last-commit summary for the file, used as its description
                        description = subprocess.check_output(
                            ["git", "show", "--no-patch", path],
                            stderr=subprocess.DEVNULL).decode("utf-8").strip() or ""
                    except subprocess.CalledProcessError as e:
                        print(f"Error fetching git description for {path}: {e}")

                print(f"Documents found '{path}'.")
                # Stable content-independent id derived from the file's location
                doc_id = hashlib.sha256(f"{os.path.dirname(path)}{path}".encode()).hexdigest()
                documents.append({"text": content_str, "id": doc_id, "description": description})
            except Exception as e:
                print(f"Error reading file {path}: {e}")
    return documents


def load_git_data():
    """Return one document per non-merge commit in the current repository."""
    if not is_git_directory():
        print("Current directory is not a Git repository.")
        return []

    log_output = subprocess.check_output([
        "git", "log", "--pretty=format:%h %s", "--no-merges"
    ], stderr=subprocess.DEVNULL, text=True).strip()

    git_documents = []
    for entry in log_output.split("\n"):
        if not entry:
            # e.g. a repository with no commits produces empty output
            continue
        # partition() tolerates commits with an empty subject line, where
        # split(maxsplit=1) would raise ValueError on tuple unpacking.
        commit_hash, _, message = entry.partition(" ")
        description = subprocess.check_output(
            ["git", "show", "--no-patch", commit_hash],
            stderr=subprocess.DEVNULL).decode("utf-8").strip() or ""
        git_documents.append({
            "text": f"Commit {commit_hash}: {message}",
            "id": commit_hash,
            "description": description
        })

    return git_documents


def generate_embeddings(documents, embedding_model):
    """Attach a 'vector' embedding to every document, in place."""
    print("Generating embeddings...")
    for doc in documents:
        doc["vector"] = embedding_model.generate_embeddings([doc["text"]])[0]
    print("Done.")
    return documents


def upsert_documents(table, documents):
    """Upsert *documents* by id and report the resulting table size."""
    table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(documents)
    # count_rows() is the table total, not the number of rows written —
    # report both instead of mislabeling the total as "inserted".
    print(f"Upserted {len(documents)} documents; table now has {table.count_rows()} rows.")


def create_vector_index(table):
    """Build the cosine ANN index over the 'vector' column."""
    try:
        print("Creating vector index")
        table.create_index(metric="cosine", vector_column_name="vector")
        print("Vector index created successfully.")
    except Exception as e:
        sys.exit(f"Error creating vector index: {e}")


def wait_for_index(table, index_name):
    """Poll until *index_name* appears in the table's index list."""
    POLL_INTERVAL = 10  # seconds between polls
    while True:
        indices = table.list_indices()
        if indices and any(index.name == index_name for index in indices):
            break
        print(f"Waiting for {index_name} to be ready...")
        time.sleep(POLL_INTERVAL)
    print(f"Vector index {index_name} is ready!")


def main():
    """Command-line entry point: ingest files + git history, then index."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="config.ini", help="Path to config file")
    parser.add_argument("--root", type=str, help="Root directory to process")
    parser.add_argument("--exclude", type=str, help="Exclude patterns separated by '|'")
    args, _ = parser.parse_known_args()

    config = load_config(args)
    db = setup_database(config)
    embedding_model = setup_embedding_model(config)
    table = create_table(db)

    documents = load_documents(root=args.root, exclude=args.exclude)
    documents.extend(load_git_data())

    documents = generate_embeddings(documents, embedding_model)
    upsert_documents(table, documents)
    create_vector_index(table)
    wait_for_index(table, "vector_idx")
    print("Documents inserted.")


if __name__ == "__main__":
    main()