From 9842d55bbfa03990e220519dc7cbbf1949875779 Mon Sep 17 00:00:00 2001 From: Lutchy Horace Date: Wed, 16 Apr 2025 16:57:39 -0400 Subject: [PATCH] Initial code commit --- README.md | 91 +++++++++------ config.ini | 19 ++++ config.ini.example | 18 +++ config.yaml.example | 32 ++++++ lancedb-context.service | 15 +++ lancedb_context_provider.py | 81 ++++++++++++++ lancedb_ingest.py | 217 ++++++++++++++++++++++++++++++++++++ 7 files changed, 441 insertions(+), 32 deletions(-) create mode 100644 config.ini create mode 100644 config.ini.example create mode 100644 config.yaml.example create mode 100644 lancedb-context.service create mode 100755 lancedb_context_provider.py create mode 100644 lancedb_ingest.py diff --git a/README.md b/README.md index da21137..5a7a24d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ -Here’s a polished `README.md` for your project **CodeRecall**: --- @@ -26,11 +25,11 @@ No cloud APIs. No latency. Full control. ``` CodeRecall/ -├── chroma_ingest.py # Ingest codebase + Git into ChromaDB -├── chroma_context_provider.py # VS Code Continue context provider -├── config.ini # Ollama + Chroma settings -├── chroma-db/ # ChromaDB persistence directory -└── config.json # Continue extension config +├── lancedb_ingest.py # Ingest codebase + Git into ChromaDB +├── lancedb_context_provider.py # VS Code Continue context provider +├── config.ini.example # Ollama + LanceDB settings +├── lancedb-data/ # LanceDB persistence directory +└── config.json # Continue extension config ``` --- @@ -40,7 +39,7 @@ CodeRecall/ ### 1. Install dependencies ```bash -pip install chromadb requests +pip install lancedb ``` Make sure you have: @@ -51,11 +50,24 @@ Make sure you have: ### 2. 
Configure `config.ini` ```ini -[ollama] +[[ollama] url = http://localhost:11434 -[chroma] -persist_directory = ./chroma-db +[lancedb] +persist_directory = ./lancedb-data + +[s3] +enable = True +bucket_name = my-s3-bucket +access_key_id = my-access-key +secret_access_key = my-secret-key +region = us-east-1 +# Optional, if using third party s3 providers +endpoint = http://minio:9000 + +[server] +host = 0.0.0.0 +port = 8080 ``` --- @@ -63,7 +75,7 @@ persist_directory = ./chroma-db ## 📥 Ingest your project ```bash -python chroma_ingest.py +python lancedb_ingest.py ``` This loads: @@ -75,26 +87,41 @@ This loads: ## 🧠 Add as a VS Code Context Provider -### `config.json` for Continue +### `config.yaml` for Continue -```json -{ - "models": [ - { - "title": "LLaMA 3 (Ollama)", - "provider": "ollama", - "model": "llama3", - "apiBase": "http://localhost:11434" - } - ], - "contextProviders": [ - { - "title": "ChromaDB Search", - "provider": "custom", - "path": "./chroma_context_provider.py" - } - ] -} +```yaml +name: Local Assistant +version: 1.0.0 +schema: v1 +models: + - name: Ollama Autodetect + provider: ollama + model: AUTODETECT + apiBase: http://localhost:11434 + - name: Ollama Autocomplete + provider: ollama + model: qwen2.5-coder:1.5b-base + apiBase: http://localhost:11434 + roles: + - autocomplete + - name: Nomic Embed Text + provider: ollama + model: nomic-embed-text + apiBase: http://localhost:11434 + roles: + - embed +context: + - provider: code + - provider: docs + - provider: diff + - provider: terminal + - provider: problems + - provider: folder + - provider: codebase + # LanceDB Context Provider + - provider: http + params: + url: http://localhost/retrieve ``` --- @@ -103,7 +130,7 @@ This loads: 1. Launch VS Code. 2. Open the Continue sidebar. -3. Set `"ChromaDB Search"` as your context provider. +3. Set `"@HTTP"` as your context provider. 4. Ask your model questions about your codebase, architecture, or commits. 
Example prompt: @@ -114,7 +141,7 @@ Example prompt: ## 📌 Notes - Default embedding model is `nomic-embed-text` (via Ollama). -- Change `n_results` in `chroma_context_provider.py` for broader/narrower context. +- Change `n_results` in `lancedb_context_provider.py` for broader/narrower context. - Works offline, no API keys required. --- diff --git a/config.ini b/config.ini new file mode 100644 index 0000000..0b562a2 --- /dev/null +++ b/config.ini @@ -0,0 +1,19 @@ +[ollama] +url = https://ollama-dev-lxc.int.lhprojects.net + +[server] +host = 127.0.0.1 +port = 8080 + +[lancedb] +persist_directory = ./lancedb + +[s3] +bucket_name = lancedb +access_key_id = qW8S1xub7zhxHtItT9WG +secret_access_key = HEqQuNJXn7uR17h8T4QvmUbwdswe9PeKJy7f5wIp +region = us-east-1 +enable = True +# Optional, only set if s3_type is 'other' +endpoint = https://s3.minidrive.cloud + diff --git a/config.ini.example b/config.ini.example new file mode 100644 index 0000000..68ad866 --- /dev/null +++ b/config.ini.example @@ -0,0 +1,18 @@ +[ollama] +url = http://localhost:11434 + +[lancedb] +persist_directory = ./lancedb-data + +[s3] +enable = True +bucket_name = my-s3-bucket +access_key_id = my-access-key +secret_access_key = my-secret-key +region = us-east-1 +# Optional, if using third party s3 providers +endpoint = http://minio:9000 + +[server] +host = 0.0.0.0 +port = 8080 \ No newline at end of file diff --git a/config.yaml.example b/config.yaml.example new file mode 100644 index 0000000..4111c19 --- /dev/null +++ b/config.yaml.example @@ -0,0 +1,32 @@ +name: Local Assistant +version: 1.0.0 +schema: v1 +models: + - name: Ollama Autodetect + provider: ollama + model: AUTODETECT + apiBase: http://localhost:11434 + - name: Ollama Autocomplete + provider: ollama + model: qwen2.5-coder:1.5b-base + apiBase: http://localhost:11434 + roles: + - autocomplete + - name: Nomic Embed Text + provider: ollama + model: nomic-embed-text + apiBase: http://localhost:11434 + roles: + - embed +context: + - provider: 
#!/usr/bin/env python3
"""HTTP context provider for the Continue VS Code extension.

Exposes a single POST /retrieve endpoint that embeds the incoming query
with Ollama (nomic-embed-text) and returns the three nearest documents
from the "vectordb" LanceDB table populated by lancedb_ingest.py.
"""
from fastapi import FastAPI, Depends
from pydantic import BaseModel
import lancedb
from lancedb.embeddings.ollama import OllamaEmbeddings
import configparser
import argparse
import os
import uvicorn

app = FastAPI()


def load_config(args):
    """Read the INI configuration file named by --config."""
    config = configparser.ConfigParser()
    config.read(args.config)
    return config


def setup_database(config):
    """Connect to LanceDB, using S3 storage when [s3] enable is truthy."""
    s3_bucket = config.get("s3", "bucket_name", fallback=None)
    enable_s3 = config.get("s3", "enable", fallback="false").lower() in ['true', 'yes', '1', 'on']
    persist_directory = config.get("lancedb", "persist_directory", fallback="./lancedb")

    if s3_bucket and enable_s3:
        storage_options = {
            "aws_access_key_id": config.get("s3", "access_key_id", fallback=None),
            "aws_secret_access_key": config.get("s3", "secret_access_key", fallback=None),
            "region": config.get("s3", "region", fallback="us-east-1"),
            # endpoint is only needed for third-party S3 providers (e.g. MinIO)
            "endpoint_url": config.get("s3", "endpoint", fallback=None),
        }
        db = lancedb.connect(
            f"s3://{s3_bucket}",
            storage_options=storage_options
        )
    else:
        db = lancedb.connect(persist_directory)

    return db


class ContextProviderInput(BaseModel):
    # Payload shape sent by Continue's "http" context provider.
    query: str
    fullInput: str


@app.post("/retrieve")
async def retrieve_context(input: ContextProviderInput, embedding_function: OllamaEmbeddings = Depends(lambda: ollama_ef)):
    """Return up to three context items most similar to the query."""
    # Embed the query text with the same model used at ingest time
    query_embedding = embedding_function.generate_embeddings([input.query])[0]

    # Cosine similarity search against the ingested document vectors
    results = table.search(query_embedding).distance_type("cosine").limit(3).to_list()

    # Shape the rows into Continue context items
    context_items = []
    for result in results:
        context_items.append({
            "name": result.get("id", "unknown"),
            "description": result.get("description", "document"),
            "content": result["text"]
        })
    return context_items


if __name__ == "__main__":
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="config.ini", help="Path to config file")
    args, _ = parser.parse_known_args()

    # Load configuration and create database connection
    config = load_config(args)
    db = setup_database(config)

    # Open the table created by lancedb_ingest.py
    table = db.open_table("vectordb")

    # BUGFIX: OllamaEmbeddings takes `name` (model) and `host` keyword
    # arguments — `model=`/`url=` is wrong and inconsistent with
    # lancedb_ingest.py's setup_embedding_model().
    ollama_url = config.get("ollama", "url")
    ollama_ef = OllamaEmbeddings(name="nomic-embed-text", host=ollama_url)

    # Run the application
    host = config.get("server", "host")
    port = int(config.get("server", "port"))
    uvicorn.run(app, host=host, port=port)
#!/usr/bin/env python3
"""Ingest a codebase plus its Git history into a LanceDB vector table.

Walks a directory tree, embeds each text file (and each non-merge Git
commit) with Ollama's nomic-embed-text model, then upserts everything
into the "vectordb" table served by lancedb_context_provider.py.
"""
import argparse
import subprocess
import os
import sys
import pyarrow as pa
import time
import hashlib
import signal
import configparser
import lancedb
import mimetypes
import fnmatch
from lancedb.embeddings.ollama import OllamaEmbeddings

# Silence noisy Rust-side logging from the lancedb native layer
os.environ['RUST_LOG'] = 'error'


def handle_signals(signame):
    """Build a signal handler that announces *signame* and exits cleanly."""
    def signal_handler(sig, frame):
        print(f"\nReceived {signame} signal. Exiting...")
        # sys.exit instead of quit(): quit() comes from the optional `site`
        # module and is absent under `python -S` or in frozen builds.
        sys.exit(0)
    return signal_handler


# Register signal handlers
signal.signal(signal.SIGINT, handle_signals("SIGINT"))
signal.signal(signal.SIGTERM, handle_signals("SIGTERM"))


def load_config(args):
    """Read the INI configuration file named by --config."""
    config = configparser.ConfigParser()
    config.read(args.config)
    return config


def setup_database(config):
    """Connect to LanceDB, using S3 storage when [s3] enable is truthy."""
    s3_bucket = config.get("s3", "bucket_name", fallback=None)
    enable_s3 = config.get("s3", "enable", fallback="false").lower() in ['true', 'yes', '1', 'on']
    persist_directory = config.get("lancedb", "persist_directory", fallback="./lancedb")

    if s3_bucket and enable_s3:
        storage_options = {
            "aws_access_key_id": config.get("s3", "access_key_id", fallback=None),
            "aws_secret_access_key": config.get("s3", "secret_access_key", fallback=None),
            "region": config.get("s3", "region", fallback="us-east-1"),
            # endpoint is only needed for third-party S3 providers (e.g. MinIO)
            "endpoint_url": config.get("s3", "endpoint", fallback=None),
        }
        db = lancedb.connect(
            f"s3://{s3_bucket}",
            storage_options=storage_options
        )
    else:
        db = lancedb.connect(persist_directory)

    return db


def setup_embedding_model(config):
    """Create the Ollama embedding function used for all documents."""
    ollama_url = config.get("ollama", "url", fallback="http://localhost:11434")
    embedding_model = OllamaEmbeddings(
        host=ollama_url,
        name='nomic-embed-text',
        options=None,
        keep_alive=None,
        ollama_client_kwargs={
            # NOTE(review): disables TLS certificate verification — confirm
            # this is intentional for the target Ollama deployment.
            'verify': False
        }
    )
    return embedding_model


def create_table(db):
    """Open the "vectordb" table, creating it with the schema when missing."""
    table_name = "vectordb"
    # 768-dim vectors match nomic-embed-text's output size
    schema = pa.schema([
        pa.field("text", pa.string()),
        pa.field("id", pa.string()),
        pa.field("description", pa.string()),
        pa.field("vector", pa.list_(pa.float64(), 768))
    ])

    try:
        table = db.open_table(table_name)
    except ValueError as e:
        if "Table '" in str(e) and "' was not found" in str(e):
            print(f"Table '{table_name}' not found. Creating...")
            # The original re-wrapped an existing pa.schema in pa.schema();
            # the redundant conversion is removed.
            table = db.create_table(table_name, schema=schema, mode="overwrite")
        else:
            sys.exit(f"An error occurred when opening table: {e}")
    return table


def is_git_directory(path="."):
    """Return True when *path* is inside a Git working tree."""
    return subprocess.call(
        ['git', 'rev-parse', '--is-inside-work-tree'],
        cwd=path, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0


def load_documents(root=None, exclude=None):
    """Collect text documents under *root*, skipping *exclude* patterns.

    *exclude* is a '|'-separated list of glob patterns matched against each
    file's full path. Returns a list of {text, id, description} dicts.
    """
    documents = []

    # Split exclude into patterns if provided
    exclude_patterns = exclude.split("|") if exclude else []

    # Default to the current working directory
    if root is None:
        root = os.getcwd()

    for dirpath, dirs, files in os.walk(root):

        # Skip files matching any exclude pattern (full-path match).
        # Use the fnmatch module directly rather than reaching through
        # glob's private re-export (glob.fnmatch).
        files = [f for f in files
                 if not any(fnmatch.fnmatch(f"{dirpath}/{f}", pattern)
                            for pattern in exclude_patterns)]

        for file in files:
            path = os.path.join(dirpath, file)
            try:
                with open(path, "rb") as f:
                    content_bytes = f.read()
                content_type, _ = mimetypes.guess_type(path)

                # Decode permissively; binary junk is filtered below
                content_str = content_bytes.decode("utf-8", errors='ignore')

                # Explicitly treat application/json as text/plain
                if 'application/json' == content_type:
                    content_type = "text/plain"

                # Heuristic fallback when the MIME type is unknown or
                # non-text: keep the file only if the decoded content
                # contains whitespace, which binary blobs rarely do.
                if content_type is None or 'text' not in content_type:
                    if not any(char in content_str for char in "\n\r\t\v\f "):
                        continue

                description = ""
                if is_git_directory(dirpath):
                    try:
                        # Last-commit summary for the file, used as its description
                        description = subprocess.check_output(
                            ["git", "show", "--no-patch", path],
                            stderr=subprocess.DEVNULL).decode("utf-8").strip() or ""
                    except subprocess.CalledProcessError as e:
                        print(f"Error fetching git description for {path}: {e}")

                print(f"Documents found '{path}'.")
                # Stable content-independent id derived from the file's location
                doc_id = hashlib.sha256(f"{os.path.dirname(path)}{path}".encode()).hexdigest()
                documents.append({"text": content_str, "id": doc_id, "description": description})
            except Exception as e:
                print(f"Error reading file {path}: {e}")
    return documents


def load_git_data():
    """Return one document per non-merge commit in the current repository."""
    if not is_git_directory():
        print("Current directory is not a Git repository.")
        return []

    log_output = subprocess.check_output([
        "git", "log", "--pretty=format:%h %s", "--no-merges"
    ], stderr=subprocess.DEVNULL, text=True).strip()

    git_documents = []
    for entry in log_output.split("\n"):
        if not entry:
            # e.g. a repository with no commits produces empty output
            continue
        # partition() tolerates commits with an empty subject line, where
        # split(maxsplit=1) would raise ValueError on tuple unpacking.
        commit_hash, _, message = entry.partition(" ")
        description = subprocess.check_output(
            ["git", "show", "--no-patch", commit_hash],
            stderr=subprocess.DEVNULL).decode("utf-8").strip() or ""
        git_documents.append({
            "text": f"Commit {commit_hash}: {message}",
            "id": commit_hash,
            "description": description
        })

    return git_documents


def generate_embeddings(documents, embedding_model):
    """Attach a 'vector' embedding to every document, in place."""
    print("Generating embeddings...")
    for doc in documents:
        doc["vector"] = embedding_model.generate_embeddings([doc["text"]])[0]
    print("Done.")
    return documents


def upsert_documents(table, documents):
    """Upsert *documents* by id and report the resulting table size."""
    table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(documents)
    # count_rows() is the table total, not the number of rows written —
    # report both instead of mislabeling the total as "inserted".
    print(f"Upserted {len(documents)} documents; table now has {table.count_rows()} rows.")


def create_vector_index(table):
    """Build the cosine ANN index over the 'vector' column."""
    try:
        print("Creating vector index")
        table.create_index(metric="cosine", vector_column_name="vector")
        print("Vector index created successfully.")
    except Exception as e:
        sys.exit(f"Error creating vector index: {e}")


def wait_for_index(table, index_name):
    """Poll until *index_name* appears in the table's index list."""
    POLL_INTERVAL = 10  # seconds between polls
    while True:
        indices = table.list_indices()
        if indices and any(index.name == index_name for index in indices):
            break
        print(f"Waiting for {index_name} to be ready...")
        time.sleep(POLL_INTERVAL)
    print(f"Vector index {index_name} is ready!")


def main():
    """Command-line entry point: ingest files + git history, then index."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="config.ini", help="Path to config file")
    parser.add_argument("--root", type=str, help="Root directory to process")
    parser.add_argument("--exclude", type=str, help="Exclude patterns separated by '|'")
    args, _ = parser.parse_known_args()

    config = load_config(args)
    db = setup_database(config)
    embedding_model = setup_embedding_model(config)
    table = create_table(db)

    documents = load_documents(root=args.root, exclude=args.exclude)
    documents.extend(load_git_data())

    documents = generate_embeddings(documents, embedding_model)
    upsert_documents(table, documents)
    create_vector_index(table)
    wait_for_index(table, "vector_idx")
    print("Documents inserted.")


if __name__ == "__main__":
    main()