Initial code commit
This commit is contained in:
parent
14c06787dc
commit
9842d55bbf
7 changed files with 441 additions and 32 deletions
89
README.md
89
README.md
|
@ -1,4 +1,3 @@
|
|||
Here’s a polished `README.md` for your project **CodeRecall**:
|
||||
|
||||
---
|
||||
|
||||
|
@ -26,10 +25,10 @@ No cloud APIs. No latency. Full control.
|
|||
|
||||
```
|
||||
CodeRecall/
|
||||
├── chroma_ingest.py # Ingest codebase + Git into ChromaDB
|
||||
├── chroma_context_provider.py # VS Code Continue context provider
|
||||
├── config.ini # Ollama + Chroma settings
|
||||
├── chroma-db/ # ChromaDB persistence directory
|
||||
├── lancedb_ingest.py # Ingest codebase + Git into ChromaDB
|
||||
├── lancedb_context_provider.py # VS Code Continue context provider
|
||||
├── config.ini.example # Ollama + LanceDB settings
|
||||
├── lancedb-data/ # LanceDB persistence directory
|
||||
└── config.json # Continue extension config
|
||||
```
|
||||
|
||||
|
@ -40,7 +39,7 @@ CodeRecall/
|
|||
### 1. Install dependencies
|
||||
|
||||
```bash
|
||||
pip install chromadb requests
|
||||
pip install lancedb
|
||||
```
|
||||
|
||||
Make sure you have:
|
||||
|
@ -51,11 +50,24 @@ Make sure you have:
|
|||
### 2. Configure `config.ini`
|
||||
|
||||
```ini
|
||||
[ollama]
|
||||
[ollama]
|
||||
url = http://localhost:11434
|
||||
|
||||
[chroma]
|
||||
persist_directory = ./chroma-db
|
||||
[lancedb]
|
||||
persist_directory = ./lancedb-data
|
||||
|
||||
[s3]
|
||||
enable = True
|
||||
bucket_name = my-s3-bucket
|
||||
access_key_id = my-access-key
|
||||
secret_access_key = my-secret-key
|
||||
region = us-east-1
|
||||
# Optional, if using third party s3 providers
|
||||
endpoint = http://minio:9000
|
||||
|
||||
[server]
|
||||
host = 0.0.0.0
|
||||
port = 8080
|
||||
```
|
||||
|
||||
---
|
||||
|
@ -63,7 +75,7 @@ persist_directory = ./chroma-db
|
|||
## 📥 Ingest your project
|
||||
|
||||
```bash
|
||||
python chroma_ingest.py
|
||||
python lancedb_ingest.py
|
||||
```
|
||||
|
||||
This loads:
|
||||
|
@ -75,26 +87,41 @@ This loads:
|
|||
|
||||
## 🧠 Add as a VS Code Context Provider
|
||||
|
||||
### `config.json` for Continue
|
||||
### `config.yaml` for Continue
|
||||
|
||||
```json
|
||||
{
|
||||
"models": [
|
||||
{
|
||||
"title": "LLaMA 3 (Ollama)",
|
||||
"provider": "ollama",
|
||||
"model": "llama3",
|
||||
"apiBase": "http://localhost:11434"
|
||||
}
|
||||
],
|
||||
"contextProviders": [
|
||||
{
|
||||
"title": "ChromaDB Search",
|
||||
"provider": "custom",
|
||||
"path": "./chroma_context_provider.py"
|
||||
}
|
||||
]
|
||||
}
|
||||
```yaml
|
||||
name: Local Assistant
|
||||
version: 1.0.0
|
||||
schema: v1
|
||||
models:
|
||||
- name: Ollama Autodetect
|
||||
provider: ollama
|
||||
model: AUTODETECT
|
||||
apiBase: http://localhost:11434
|
||||
- name: Ollama Autocomplete
|
||||
provider: ollama
|
||||
model: qwen2.5-coder:1.5b-base
|
||||
apiBase: http://localhost:11434
|
||||
roles:
|
||||
- autocomplete
|
||||
- name: Nomic Embed Text
|
||||
provider: ollama
|
||||
model: nomic-embed-text
|
||||
apiBase: http://localhost:11434
|
||||
roles:
|
||||
- embed
|
||||
context:
|
||||
- provider: code
|
||||
- provider: docs
|
||||
- provider: diff
|
||||
- provider: terminal
|
||||
- provider: problems
|
||||
- provider: folder
|
||||
- provider: codebase
|
||||
# LanceDB Context Provider
|
||||
- provider: http
|
||||
params:
|
||||
url: http://localhost/retrieve
|
||||
```
|
||||
|
||||
---
|
||||
|
@ -103,7 +130,7 @@ This loads:
|
|||
|
||||
1. Launch VS Code.
|
||||
2. Open the Continue sidebar.
|
||||
3. Set `"ChromaDB Search"` as your context provider.
|
||||
3. Set `"@HTTP"` as your context provider.
|
||||
4. Ask your model questions about your codebase, architecture, or commits.
|
||||
|
||||
Example prompt:
|
||||
|
@ -114,7 +141,7 @@ Example prompt:
|
|||
## 📌 Notes
|
||||
|
||||
- Default embedding model is `nomic-embed-text` (via Ollama).
|
||||
- Change `n_results` in `chroma_context_provider.py` for broader/narrower context.
|
||||
- Change `n_results` in `lancedb_context_provider.py` for broader/narrower context.
|
||||
- Works offline, no API keys required.
|
||||
|
||||
---
|
||||
|
|
19
config.ini
Normal file
19
config.ini
Normal file
|
@ -0,0 +1,19 @@
|
|||
[ollama]
|
||||
url = https://ollama-dev-lxc.int.lhprojects.net
|
||||
|
||||
[server]
|
||||
host = 127.0.0.1
|
||||
port = 8080
|
||||
|
||||
[lancedb]
|
||||
persist_directory = ./lancedb
|
||||
|
||||
[s3]
|
||||
bucket_name = lancedb
|
||||
# SECURITY: real credentials were committed here — rotate them immediately and
# keep secrets out of version control (use config.ini.example as the template).
access_key_id = CHANGE_ME
secret_access_key = CHANGE_ME
|
||||
region = us-east-1
|
||||
enable = True
|
||||
# Optional, only set if s3_type is 'other'
|
||||
endpoint = https://s3.minidrive.cloud
|
||||
|
18
config.ini.example
Normal file
18
config.ini.example
Normal file
|
@ -0,0 +1,18 @@
|
|||
[ollama]
|
||||
url = http://localhost:11434
|
||||
|
||||
[lancedb]
|
||||
persist_directory = ./lancedb-data
|
||||
|
||||
[s3]
|
||||
enable = True
|
||||
bucket_name = my-s3-bucket
|
||||
access_key_id = my-access-key
|
||||
secret_access_key = my-secret-key
|
||||
region = us-east-1
|
||||
# Optional, if using third party s3 providers
|
||||
endpoint = http://minio:9000
|
||||
|
||||
[server]
|
||||
host = 0.0.0.0
|
||||
port = 8080
|
32
config.yaml.example
Normal file
32
config.yaml.example
Normal file
|
@ -0,0 +1,32 @@
|
|||
name: Local Assistant
|
||||
version: 1.0.0
|
||||
schema: v1
|
||||
models:
|
||||
- name: Ollama Autodetect
|
||||
provider: ollama
|
||||
model: AUTODETECT
|
||||
apiBase: http://localhost:11434
|
||||
- name: Ollama Autocomplete
|
||||
provider: ollama
|
||||
model: qwen2.5-coder:1.5b-base
|
||||
apiBase: http://localhost:11434
|
||||
roles:
|
||||
- autocomplete
|
||||
- name: Nomic Embed Text
|
||||
provider: ollama
|
||||
model: nomic-embed-text
|
||||
apiBase: http://localhost:11434
|
||||
roles:
|
||||
- embed
|
||||
context:
|
||||
- provider: code
|
||||
- provider: docs
|
||||
- provider: diff
|
||||
- provider: terminal
|
||||
- provider: problems
|
||||
- provider: folder
|
||||
- provider: codebase
|
||||
# LanceDB Context Provider
|
||||
- provider: http
|
||||
params:
|
||||
url: http://localhost/retrieve
|
15
lancedb-context.service
Normal file
15
lancedb-context.service
Normal file
|
@ -0,0 +1,15 @@
|
|||
[Unit]
|
||||
Description=LanceDB Context Provider API for Continue
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=your-username
|
||||
WorkingDirectory=/path/to/your/project
|
||||
ExecStart=/path/to/your/project/lancedb_context_provider.py --config $CONFIG_PATH
|
||||
EnvironmentFile=-/etc/default/lancedb_context_provider
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
81
lancedb_context_provider.py
Executable file
81
lancedb_context_provider.py
Executable file
|
@ -0,0 +1,81 @@
|
|||
#!/usr/bin/env python3
|
||||
from fastapi import FastAPI, Depends
|
||||
from pydantic import BaseModel
|
||||
import lancedb
|
||||
from lancedb.embeddings.ollama import OllamaEmbeddings
|
||||
import configparser
|
||||
import argparse
|
||||
import os
|
||||
import uvicorn
|
||||
|
||||
# FastAPI application exposing the /retrieve endpoint consumed by the
# Continue extension's HTTP context provider.
app = FastAPI()
|
||||
|
||||
def load_config(args):
    """Parse and return the INI configuration file named by ``args.config``."""
    parser = configparser.ConfigParser()
    parser.read(args.config)
    return parser
|
||||
|
||||
def setup_database(config):
    """Connect to LanceDB: S3 storage when [s3] is enabled, local disk otherwise."""
    bucket = config.get("s3", "bucket_name", fallback=None)
    use_s3 = config.get("s3", "enable", fallback="false").lower() in ['true', 'yes', '1', 'on']
    local_dir = config.get("lancedb", "persist_directory", fallback="./lancedb")

    # Local mode: nothing else to configure.
    if not (bucket and use_s3):
        return lancedb.connect(local_dir)

    # S3 mode: pass credentials/endpoint straight through to lancedb.
    opts = {
        "aws_access_key_id": config.get("s3", "access_key_id", fallback=None),
        "aws_secret_access_key": config.get("s3", "secret_access_key", fallback=None),
        "region": config.get("s3", "region", fallback="us-east-1"),
        "endpoint_url": config.get("s3", "endpoint", fallback=None),
    }
    return lancedb.connect(f"s3://{bucket}", storage_options=opts)
|
||||
|
||||
class ContextProviderInput(BaseModel):
    """Request body POSTed to /retrieve by Continue's HTTP context provider."""
    query: str  # search text to embed and match against the table
    fullInput: str  # full user prompt; accepted but not used by /retrieve
|
||||
|
||||
@app.post("/retrieve")
async def retrieve_context(input: ContextProviderInput, embedding_function: OllamaEmbeddings = Depends(lambda: ollama_ef)):
    """Embed the query, run a cosine search over the table, return context items."""
    # Embed the incoming query text (batch API; take the single vector).
    vector = embedding_function.generate_embeddings([input.query])[0]

    # Top-3 nearest documents from the module-level `table` opened in __main__.
    hits = table.search(vector).distance_type("cosine").limit(3).to_list()

    # Shape each hit into the context-item dict Continue expects.
    return [
        {
            "name": hit.get("id", "unknown"),
            "description": hit.get("description", "document"),
            "content": hit["text"],
        }
        for hit in hits
    ]
|
||||
|
||||
if __name__ == "__main__":
    # Parse command line arguments (unknown args are tolerated so extra
    # service flags don't break startup).
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="config.ini", help="Path to config file")
    args, _ = parser.parse_known_args()

    # Load configuration and create database connection.
    config = load_config(args)
    db = setup_database(config)

    # Open the table produced by lancedb_ingest.py.
    table = db.open_table("vectordb")

    # Initialize the Ollama embedding function.
    # FIX: lancedb's OllamaEmbeddings takes `name`/`host` keyword arguments
    # (exactly as lancedb_ingest.py uses), not `model`/`url`.
    ollama_url = config.get("ollama", "url")
    ollama_ef = OllamaEmbeddings(name="nomic-embed-text", host=ollama_url)

    # Run the application.
    host = config.get("server", "host")
    port = int(config.get("server", "port"))
    uvicorn.run(app, host=host, port=port)
|
217
lancedb_ingest.py
Normal file
217
lancedb_ingest.py
Normal file
|
@ -0,0 +1,217 @@
|
|||
#!/usr/bin/env python3
|
||||
import argparse
import configparser
import fnmatch
import glob
import hashlib
import json
import mimetypes
import os
import signal
import subprocess
import time
from functools import wraps

import lancedb
import pyarrow as pa
from lancedb.embeddings.ollama import OllamaEmbeddings
|
||||
|
||||
# Quiet Rust-level log output from LanceDB's native layer: only "error"
# and above are emitted.
os.environ['RUST_LOG'] = 'error'
|
||||
|
||||
def handle_signals(signame):
    """Build a signal handler that reports *signame* and exits cleanly.

    Returns a callable suitable for ``signal.signal``: it prints which
    signal arrived, then raises ``SystemExit(0)``.
    """
    def signal_handler(sig, frame):
        print(f"\nReceived {signame} signal. Exiting...")
        # FIX: raise SystemExit instead of quit() — quit() is an interactive
        # helper injected by the `site` module and is not guaranteed to
        # exist when scripts run with -S or under some embeddings.
        raise SystemExit(0)
    return signal_handler
|
||||
|
||||
# Register signal handlers so Ctrl-C / service stop exit gracefully
# instead of dumping a traceback mid-ingest.
signal.signal(signal.SIGINT, handle_signals("SIGINT"))
signal.signal(signal.SIGTERM, handle_signals("SIGTERM"))
|
||||
|
||||
def load_config(args):
    """Parse and return the INI configuration file named by ``args.config``."""
    parser = configparser.ConfigParser()
    parser.read(args.config)
    return parser
|
||||
|
||||
def setup_database(config):
    """Connect to LanceDB: S3 storage when [s3] is enabled, local disk otherwise."""
    bucket = config.get("s3", "bucket_name", fallback=None)
    use_s3 = config.get("s3", "enable", fallback="false").lower() in ['true', 'yes', '1', 'on']
    local_dir = config.get("lancedb", "persist_directory", fallback="./lancedb")

    # Local mode: nothing else to configure.
    if not (bucket and use_s3):
        return lancedb.connect(local_dir)

    # S3 mode: pass credentials/endpoint straight through to lancedb.
    opts = {
        "aws_access_key_id": config.get("s3", "access_key_id", fallback=None),
        "aws_secret_access_key": config.get("s3", "secret_access_key", fallback=None),
        "region": config.get("s3", "region", fallback="us-east-1"),
        "endpoint_url": config.get("s3", "endpoint", fallback=None),
    }
    return lancedb.connect(f"s3://{bucket}", storage_options=opts)
|
||||
|
||||
def setup_embedding_model(config):
    """Create the Ollama embedding function used to vectorize documents.

    Reads the Ollama endpoint from the [ollama] section (defaulting to a
    local instance) and returns a lancedb ``OllamaEmbeddings`` configured
    for the `nomic-embed-text` model.
    """
    ollama_url = config.get("ollama", "url", fallback="http://localhost:11434")
    embedding_model = OllamaEmbeddings(
        host=ollama_url,
        name='nomic-embed-text',
        options=None, # type=o.llama.Options
        keep_alive=None,
        ollama_client_kwargs={
            # NOTE(review): TLS certificate verification is disabled here —
            # only acceptable for internal/self-signed endpoints; confirm.
            'verify': False # Disable certificate verification (not recommended)
        }
    )
    return embedding_model
|
||||
|
||||
def create_table(db, table_name="vectordb"):
    """Open the vector table, creating it with the expected schema if missing.

    Parameters:
        db: an open lancedb connection.
        table_name: table to open/create (default "vectordb", matching
            lancedb_context_provider.py).

    Raises SystemExit on any open failure other than "table not found".
    """
    schema = pa.schema([
        pa.field("text", pa.string()),
        pa.field("id", pa.string()),
        pa.field("description", pa.string()),
        # 768-dim embeddings as produced by nomic-embed-text.
        pa.field("vector", pa.list_(pa.float64(), 768)),
    ])

    try:
        table = db.open_table(table_name)
    except ValueError as e:
        # lancedb signals a missing table via ValueError("Table '<name>' was
        # not found"); anything else is a genuine failure.
        if "Table '" in str(e) and "' was not found" in str(e):
            print(f"Table '{table_name}' not found. Creating...")
            # FIX: the original re-wrapped an already-built pa.schema in
            # pa.schema(...) under the misleading name `schema_dict`.
            table = db.create_table(table_name, schema=schema, mode="overwrite")
        else:
            # FIX: quit() is an interactive helper; raise SystemExit instead.
            raise SystemExit(f"An error occurred when opening table: {e}")
    return table
|
||||
|
||||
def is_git_directory(path="."):
    """Return True when *path* lies inside a Git working tree."""
    status = subprocess.call(
        ['git', 'rev-parse', '--is-inside-work-tree'],
        cwd=path,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return status == 0
|
||||
|
||||
def load_documents(root=None, exclude=None):
    """Collect text documents under *root* for ingestion.

    Walks the directory tree, skipping files matching any of the
    '|'-separated glob patterns in *exclude*, and returns a list of dicts
    with keys ``text``, ``id`` and ``description``. Binary files are
    skipped via a MIME-type guess with a whitespace heuristic fallback.

    Parameters:
        root: directory to walk (defaults to the current working directory).
        exclude: '|'-separated glob patterns matched against full paths.
    """
    documents = []

    # Split exclude into patterns if provided.
    exclude_patterns = (exclude or "").split("|") if exclude else []

    # Check if root is None and set it to the current directory if not provided.
    if root is None:
        root = os.getcwd()

    # Iterate through directories and files.
    for root, dirs, files in os.walk(root):

        # Skip files specified in exclude.
        # FIX: use fnmatch.fnmatch directly — `glob.fnmatch` is an
        # undocumented implementation detail of the glob module.
        files = [
            f for f in files
            if not any(fnmatch.fnmatch(f"{root}/{f}", pattern)
                       for pattern in exclude_patterns)
        ]

        # PERF: one git work-tree check per directory instead of per file.
        in_git_tree = is_git_directory(root) if files else False

        for file in files:
            path = os.path.join(root, file)
            try:
                with open(path, "rb") as f:
                    content_bytes = f.read()
                content_type, _ = mimetypes.guess_type(path)

                # Decode the content to UTF-8, dropping undecodable bytes.
                content_str = content_bytes.decode("utf-8", errors='ignore')

                # Explicitly treat application/json as text/plain.
                if 'application/json' == content_type:
                    content_type = "text/plain"

                # Fallback check if the guessed MIME type is None or not text:
                # content with no whitespace at all is assumed binary.
                if content_type is None or 'text' not in content_type:
                    if not any(char in content_str for char in "\n\r\t\v\f "):
                        continue

                description = ""
                if in_git_tree:
                    try:
                        # NOTE(review): `git show --no-patch <file>` only works
                        # when the path resolves to a revision; for ordinary
                        # files this raises and is logged below — possibly
                        # intended `git log -1 -- <path>`. Confirm.
                        description = subprocess.check_output(
                            ["git", "show", "--no-patch", path],
                            stderr=subprocess.DEVNULL,
                        ).decode("utf-8").strip() or ""
                    except subprocess.CalledProcessError as e:
                        print(f"Error fetching git description for {path}: {e}")

                print(f"Documents found '{path}'.")
                # NOTE(review): dirname(path) is already a prefix of path, so
                # the key doubles the directory component; kept for id
                # stability with existing tables.
                doc_id = hashlib.sha256(f"{os.path.dirname(path)}{path}".encode()).hexdigest()
                documents.append({"text": content_str, "id": doc_id, "description": description})
            except Exception as e:
                print(f"Error reading file {path}: {e}")
    return documents
|
||||
|
||||
def load_git_data():
    """Return one document per non-merge commit in the current repository.

    Each document carries the short hash as id, the subject line in the
    text, and the full ``git show --no-patch`` output as description.
    Returns an empty list when the CWD is not a Git repository.
    """
    if not is_git_directory():
        print("Current directory is not a Git repository.")
        return []

    log_entries = subprocess.check_output([
        "git", "log", "--pretty=format:%h %s", "--no-merges"
    ], stderr=subprocess.DEVNULL, text=True).strip().split("\n")

    git_documents = []
    for entry in log_entries:
        if not entry:
            # Empty repository / blank line: nothing to ingest.
            continue
        # FIX: a commit with an empty subject yields only the hash, which
        # made the original two-target unpacking raise ValueError.
        parts = entry.split(maxsplit=1)
        commit_hash = parts[0]
        message = parts[1] if len(parts) > 1 else ""
        description = subprocess.check_output(
            ["git", "show", "--no-patch", f"{commit_hash}"],
            stderr=subprocess.DEVNULL,
        ).decode("utf-8").strip() or ""
        git_documents.append({
            "text": f"Commit {commit_hash}: {message}",
            "id": commit_hash,
            "description": description,
        })

    return git_documents
|
||||
|
||||
def generate_embeddings(documents, embedding_model):
    """Attach an embedding vector to each document dict (in place).

    Calls *embedding_model* once per document and stores the result under
    the "vector" key. Returns the same list for call-chaining.
    """
    print("Generating embeddings...")
    for doc in documents:
        # The model API takes a batch and returns a list of vectors, so
        # take the first (and only) entry. FIX: dropped the unused
        # `doc_id` local from the original.
        doc["vector"] = embedding_model.generate_embeddings([doc["text"]])[0]
    print("Done.")
    return documents
|
||||
|
||||
def upsert_documents(table, documents):
    """Upsert *documents* into *table*, keyed on "id"."""
    table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(documents)
    # FIX: count_rows() reports the table total, not the number inserted;
    # the original log claimed the total was "Inserted". Report both.
    total_rows = table.count_rows()
    print(f"Upserted {len(documents)} documents; table now holds {total_rows} rows.")
|
||||
|
||||
def create_vector_index(table):
    """Build the cosine-distance index on the "vector" column.

    Exits the process when index creation fails, since retrieval is
    useless without the index.
    """
    try:
        print("Creating vector index")
        table.create_index(metric="cosine", vector_column_name="vector")
        print("Vector index created successfully.")
    except Exception as e:
        # FIX: raise SystemExit rather than quit() — quit() is an
        # interactive helper from the `site` module and may be unavailable.
        raise SystemExit(f"Error creating vector index: {e}")
|
||||
|
||||
def wait_for_index(table, index_name):
    """Block until *index_name* appears in the table's index listing."""
    poll_seconds = 10
    while True:
        current = table.list_indices() or []
        if any(ix.name == index_name for ix in current):
            break
        print(f"Waiting for {index_name} to be ready...")
        time.sleep(poll_seconds)
    print(f"Vector index {index_name} is ready!")
|
||||
|
||||
def main():
    """Entry point: ingest files and Git history into the vector table."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="config.ini", help="Path to config file")
    parser.add_argument("--root", type=str, help="Root directory to process")
    parser.add_argument("--exclude", type=str, help="Exclude patterns separated by '|'")
    args, _ = parser.parse_known_args()

    # Wire up config, storage, embeddings and the target table.
    config = load_config(args)
    db = setup_database(config)
    embedding_model = setup_embedding_model(config)
    table = create_table(db)

    # Gather documents from the file tree plus the Git commit log.
    documents = load_documents(root=args.root, exclude=args.exclude)
    documents.extend(load_git_data())

    # Embed, upsert and index.
    documents = generate_embeddings(documents, embedding_model)
    upsert_documents(table, documents)
    create_vector_index(table)
    wait_for_index(table, "vector_idx")
    print("Documents inserted.")
|
||||
|
||||
# Script entry point — keeps module import side-effect free.
if __name__ == "__main__":
    main()
|
Loading…
Add table
Reference in a new issue