code_rag/tests/test_doc_tracker.py
2025-03-21 10:09:07 -04:00

158 lines
4.9 KiB
Python

import os
import time
import pytest
from .fixtures import *
from code_rag.doc_tracker import DocMetaData, DocumentTracker, calculate_file_hash
def doc_infos_are_equal(left, right):
    """Return True when two doc_info mappings hold equal entries under the same keys.

    Args:
        left: Mapping of key to entry.
        right: Mapping to compare against ``left``.

    Returns:
        True if both mappings have exactly the same keys and no pair of
        entries under a shared key compares unequal; False otherwise.
    """
    # Compare the key sets first: walking only ``left`` would ignore keys that
    # exist solely in ``right``, so an empty ``left`` would equal anything.
    if left.keys() != right.keys():
        return False
    return not any(entry != right[key] for key, entry in left.items())
@pytest.fixture
def document_tracker(tracker_file):
    """Provide a DocumentTracker constructed with the ``tracker_file`` fixture value."""
    tracker = DocumentTracker(tracking_file=tracker_file)
    return tracker
# Tests for DocumentTracker
def test_init_new_tracker(tracker_file):
    """A freshly constructed tracker has empty doc_info and no tracking file on disk."""
    fresh = DocumentTracker(tracking_file=tracker_file)

    assert fresh.doc_info == {}
    # Constructing the tracker alone must not have created the file.
    assert os.path.exists(tracker_file) is False
def test_save_and_load_tracking_data(document_tracker, tracker_file):
    """Data saved by one tracker is loaded by a new tracker built on the same file."""
    saved_at = time.time()

    def make_entry():
        # Build a fresh instance on every call so the saved entry and the
        # expected entry never share an object or a chunk-id list.
        return DocMetaData(123456, "abcdef", saved_at, ["1", "2"])

    document_tracker.doc_info = {"test.txt": make_entry()}
    document_tracker._save_tracking_data()

    # Saving must have produced the tracking file.
    assert os.path.exists(tracker_file)

    # A second tracker on the same file should come up with the saved data.
    reloaded = DocumentTracker(tracking_file=tracker_file)
    expected = {"test.txt": make_entry()}
    assert doc_infos_are_equal(reloaded.doc_info, expected)
def test_calculate_file_hash(document_tracker, sample_docs):
    """The hash is stable for unchanged content and differs after appending to the file."""
    target = sample_docs[0]

    # Two hashes of the untouched file must agree.
    before = calculate_file_hash(target)
    repeat = calculate_file_hash(target)
    assert before == repeat

    # Appending text must yield a different hash.
    with open(target, "a") as handle:
        handle.write("Additional content")
    after = calculate_file_hash(target)
    assert before != after
def test_get_changed_files_new(document_tracker, docs_dir, sample_docs):
    """The first scan reports every sample doc as new and starts tracking it."""
    result = document_tracker.get_changed_files(docs_dir)

    assert set(result["new"]) == set(sample_docs)
    assert result["modified"] == []
    assert result["deleted"] == []

    # Every sample file should now be tracked, with no chunk ids mapped yet.
    tracked = document_tracker.doc_info
    for path in sample_docs:
        assert path in tracked
        assert tracked[path].chunk_ids == []
def test_get_changed_files_modified(document_tracker, docs_dir, sample_docs):
    """Appending to a tracked file makes the next scan report it as modified."""
    target = sample_docs[0]

    # Baseline scan so the file is already tracked.
    document_tracker.get_changed_files(docs_dir)

    # Brief pause before writing so the edit lands at a later timestamp.
    time.sleep(0.1)
    with open(target, "a") as handle:
        handle.write("Modified content")

    result = document_tracker.get_changed_files(docs_dir)
    assert result["new"] == []
    assert result["modified"] == [target]
    assert result["deleted"] == []
def test_get_changed_files_deleted(document_tracker, docs_dir, sample_docs):
    """Removing a tracked file makes the next scan report it as deleted."""
    victim = sample_docs[0]

    # Baseline scan so the file is already tracked.
    document_tracker.get_changed_files(docs_dir)

    os.remove(victim)

    result = document_tracker.get_changed_files(docs_dir)
    assert result["new"] == []
    assert result["modified"] == []
    assert result["deleted"] == [victim]
def test_update_chunk_mappings(document_tracker, sample_docs):
    """update_chunk_mappings leaves the given chunk ids on the tracked file's entry."""
    target = sample_docs[0]
    new_chunk_ids = ["chunk1", "chunk2", "chunk3"]

    # Seed an entry with no chunks so the file is tracked before the update.
    # NOTE(review): the third argument is a date string here, while the other
    # tests in this file pass a number in that position -- confirm which is intended.
    seeded = DocMetaData(123, "abc", "2023-01-01", [])
    document_tracker.doc_info[target] = seeded

    document_tracker.update_chunk_mappings(target, new_chunk_ids)
    assert document_tracker.doc_info[target].chunk_ids == new_chunk_ids
def test_get_chunks_to_delete(document_tracker):
    """get_chunks_to_delete returns the named files' chunk ids and drops them from doc_info."""
    chunks_by_file = {
        "file1.txt": ["chunk1", "chunk2"],
        "file2.txt": ["chunk3", "chunk4"],
        "file3.txt": ["chunk5"],
    }
    document_tracker.doc_info = {
        name: DocMetaData(0, "abc", 0, chunk_ids)
        for name, chunk_ids in chunks_by_file.items()
    }

    # One deleted file.
    removed = document_tracker.get_chunks_to_delete(["file1.txt"])
    assert set(removed) == {"chunk1", "chunk2"}
    assert "file1.txt" not in document_tracker.doc_info

    # Several deleted files in a single call.
    removed = document_tracker.get_chunks_to_delete(["file2.txt", "file3.txt"])
    assert set(removed) == {"chunk3", "chunk4", "chunk5"}
    for name in ("file2.txt", "file3.txt"):
        assert name not in document_tracker.doc_info