phase a audit
This commit is contained in:
@@ -23,13 +23,9 @@ final class ChunkManager
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// COUNT (for guardrails / limits)
|
||||
// COUNT (Streaming, robust)
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Counts records (NDJSON lines) in index.ndjson by streaming the file line by line.
|
||||
* Empty / corrupt lines are ignored.
|
||||
*/
|
||||
public function countAllChunks(): int
|
||||
{
|
||||
if (!is_file($this->indexPath)) {
|
||||
@@ -42,6 +38,7 @@ final class ChunkManager
|
||||
}
|
||||
|
||||
$count = 0;
|
||||
|
||||
try {
|
||||
while (($line = fgets($handle)) !== false) {
|
||||
$line = trim($line);
|
||||
@@ -49,7 +46,6 @@ final class ChunkManager
|
||||
continue;
|
||||
}
|
||||
|
||||
// NDJSON consists of JSON objects; we only count lines that decode to a valid array.
|
||||
$data = json_decode($line, true);
|
||||
if (is_array($data)) {
|
||||
$count++;
|
||||
@@ -63,7 +59,7 @@ final class ChunkManager
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// APPEND
|
||||
// APPEND (Streaming + Exception Safe)
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
@@ -82,27 +78,34 @@ final class ChunkManager
|
||||
throw new \RuntimeException('Unable to open index.ndjson for append');
|
||||
}
|
||||
|
||||
foreach ($records as $record) {
|
||||
$json = json_encode($record, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
|
||||
if ($json === false) {
|
||||
fclose($handle);
|
||||
throw new \RuntimeException('Unable to encode chunk record');
|
||||
try {
|
||||
foreach ($records as $record) {
|
||||
$json = json_encode(
|
||||
$record,
|
||||
JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
|
||||
);
|
||||
|
||||
if ($json === false) {
|
||||
throw new \RuntimeException('Unable to encode chunk record');
|
||||
}
|
||||
|
||||
if (fwrite($handle, $json . PHP_EOL) === false) {
|
||||
throw new \RuntimeException('Unable to write chunk to index');
|
||||
}
|
||||
}
|
||||
|
||||
fwrite($handle, $json . PHP_EOL);
|
||||
} finally {
|
||||
fclose($handle);
|
||||
}
|
||||
|
||||
fclose($handle);
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// COMPACTION – removes all chunks of a document
|
||||
// COMPACTION (Streaming + Safe Handles)
|
||||
// ============================================================
|
||||
|
||||
public function compactByDocument(Uuid $documentId): void
|
||||
{
|
||||
if (!is_file($this->indexPath)) {
|
||||
return; // nichts zu kompaktieren
|
||||
return;
|
||||
}
|
||||
|
||||
$tmpPath = $this->indexPath . '.tmp';
|
||||
@@ -116,32 +119,36 @@ final class ChunkManager
|
||||
|
||||
$docIdString = $documentId->toRfc4122();
|
||||
|
||||
while (($line = fgets($in)) !== false) {
|
||||
$line = trim($line);
|
||||
if ($line === '') {
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
while (($line = fgets($in)) !== false) {
|
||||
$line = trim($line);
|
||||
if ($line === '') {
|
||||
continue;
|
||||
}
|
||||
|
||||
$data = json_decode($line, true);
|
||||
if (!is_array($data)) {
|
||||
continue; // skip corrupted line
|
||||
}
|
||||
$data = json_decode($line, true);
|
||||
if (!is_array($data)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (($data['document_id'] ?? null) === $docIdString) {
|
||||
continue; // skip this document's chunks
|
||||
}
|
||||
if (($data['document_id'] ?? null) === $docIdString) {
|
||||
continue;
|
||||
}
|
||||
|
||||
fwrite($out, $line . PHP_EOL);
|
||||
if (fwrite($out, $line . PHP_EOL) === false) {
|
||||
throw new \RuntimeException('Unable to write compacted chunk');
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
fclose($in);
|
||||
fclose($out);
|
||||
}
|
||||
|
||||
fclose($in);
|
||||
fclose($out);
|
||||
|
||||
$this->atomicSwitch($tmpPath, $this->indexPath);
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// FULL REWRITE (Global Reindex)
|
||||
// FULL REWRITE (Streaming + Atomic)
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
@@ -162,23 +169,30 @@ final class ChunkManager
|
||||
throw new \RuntimeException('Unable to open temp index file');
|
||||
}
|
||||
|
||||
foreach ($records as $record) {
|
||||
$json = json_encode($record, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
|
||||
if ($json === false) {
|
||||
fclose($handle);
|
||||
throw new \RuntimeException('Unable to encode chunk record');
|
||||
try {
|
||||
foreach ($records as $record) {
|
||||
$json = json_encode(
|
||||
$record,
|
||||
JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
|
||||
);
|
||||
|
||||
if ($json === false) {
|
||||
throw new \RuntimeException('Unable to encode chunk record');
|
||||
}
|
||||
|
||||
if (fwrite($handle, $json . PHP_EOL) === false) {
|
||||
throw new \RuntimeException('Unable to write chunk during rewrite');
|
||||
}
|
||||
}
|
||||
|
||||
fwrite($handle, $json . PHP_EOL);
|
||||
} finally {
|
||||
fclose($handle);
|
||||
}
|
||||
|
||||
fclose($handle);
|
||||
|
||||
$this->atomicSwitch($tmpPath, $this->indexPath);
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// STREAM READ (for FAISS rebuild)
|
||||
// STREAM READ (FAISS rebuild safe)
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
@@ -223,4 +237,4 @@ final class ChunkManager
|
||||
throw new \RuntimeException('Atomic switch failed for index.ndjson');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user