# nip/tests/test_deduplication.nim
#
# Repository viewer metadata: 131 lines, 3.8 KiB, Nim

import unittest, os, strutils, tables, sets
import ../src/nimpak/cas
import ../src/nip/types
suite "Cross-Format Deduplication Metrics Tests":
  # Exercises `getDeduplicationStats` on a CAS that holds three 3-byte
  # chunks referenced by packages of different formats (NPK, NIP, NEXTER).

  var
    cas: CasManager
    # Scratch directory for the CAS, suffixed with the current process id.
    testRoot = getTempDir() / ("nip_dedup_test_" & $getCurrentProcessId())

    # Test data: three distinct chunks of 3 bytes each.
    chunk1 = @[1.byte, 2.byte, 3.byte]
    chunk2 = @[4.byte, 5.byte, 6.byte]
    chunk3 = @[7.byte, 8.byte, 9.byte]

    # Hashes reported by the CAS when the chunks above are stored.
    hash1: string
    hash2: string
    hash3: string

  setup:
    # Every test starts from a fresh CAS that already contains all three
    # chunks; the tests only differ in which references they add.
    createDir(testRoot)
    cas = initCasManager(testRoot)

    hash1 = cas.storeObject(chunk1).get().hash
    hash2 = cas.storeObject(chunk2).get().hash
    hash3 = cas.storeObject(chunk3).get().hash

  teardown:
    removeDir(testRoot)

  test "Basic Deduplication Stats":
    # Scenario: every chunk is used by exactly two formats.
    #   NPK    (pkg1) uses chunk1, chunk2
    #   NIP    (pkg2) uses chunk2, chunk3
    #   NEXTER (pkg3) uses chunk1, chunk3
    # Resulting reference counts:
    #   chunk1: NPK, NEXTER -> 2
    #   chunk2: NPK, NIP    -> 2
    #   chunk3: NIP, NEXTER -> 2
    discard cas.addReference(hash1, NPK, "pkg1")
    discard cas.addReference(hash2, NPK, "pkg1")

    discard cas.addReference(hash2, NIP, "pkg2")
    discard cas.addReference(hash3, NIP, "pkg2")

    discard cas.addReference(hash1, NEXTER, "pkg3")
    discard cas.addReference(hash3, NEXTER, "pkg3")

    let statsResult = cas.getDeduplicationStats()
    check statsResult.isOk
    let stats = statsResult.get()

    # Physical size: 3 chunks * 3 bytes = 9 bytes.
    check stats.totalPhysicalSize == 9

    # Logical size: each package references two 3-byte chunks,
    # so 3 packages * 6 bytes = 18 bytes.
    check stats.totalLogicalSize == 18

    # Deduplication ratio: 18 / 9 = 2.0.
    check stats.deduplicationRatio == 2.0

    # All three chunks are shared between formats.
    check stats.sharedChunks == 3

    # Savings: 18 - 9 = 9 bytes.
    check stats.savings == 9

    # Format overlap, one entry per pair of formats sharing a chunk:
    #   chunk1 -> "NEXTER-NPK"
    #   chunk2 -> "NIP-NPK"
    #   chunk3 -> "NEXTER-NIP"
    check stats.formatOverlap.hasKey("NEXTER-NPK")
    check stats.formatOverlap["NEXTER-NPK"] == 1

    check stats.formatOverlap.hasKey("NIP-NPK")
    check stats.formatOverlap["NIP-NPK"] == 1

    check stats.formatOverlap.hasKey("NEXTER-NIP")
    check stats.formatOverlap["NEXTER-NIP"] == 1

  test "No Deduplication":
    # Scenario: each format references its own, unique chunk.
    discard cas.addReference(hash1, NPK, "pkg1")
    discard cas.addReference(hash2, NIP, "pkg2")
    discard cas.addReference(hash3, NEXTER, "pkg3")

    let statsResult = cas.getDeduplicationStats()
    check statsResult.isOk
    let stats = statsResult.get()

    # Nothing is shared, so logical and physical sizes coincide.
    check stats.totalPhysicalSize == 9
    check stats.totalLogicalSize == 9
    check stats.deduplicationRatio == 1.0
    check stats.sharedChunks == 0
    check stats.savings == 0
    check stats.formatOverlap.len == 0

  test "High Redundancy":
    # Scenario: all three formats reference the same chunk.
    discard cas.addReference(hash1, NPK, "pkg1")
    discard cas.addReference(hash1, NIP, "pkg2")
    discard cas.addReference(hash1, NEXTER, "pkg3")

    let statsResult = cas.getDeduplicationStats()
    check statsResult.isOk
    let stats = statsResult.get()

    # chunk2 and chunk3 are stored by `setup` but never referenced here.
    # The expected physical size of 3 bytes assumes the stats only cover
    # referenced chunks (the original note states that
    # getDeduplicationStats iterates over `cas.formatRefs`).
    check stats.totalPhysicalSize == 3

    # Logical size: 3 references * 3 bytes = 9 bytes.
    check stats.totalLogicalSize == 9
    check stats.deduplicationRatio == 3.0
    check stats.sharedChunks == 1

    # Savings follow the same logical - physical rule asserted in the
    # other tests of this suite: 9 - 3 = 6 bytes.
    check stats.savings == 6

    # A chunk shared by all three formats yields a single three-way key.
    check stats.formatOverlap.hasKey("NEXTER-NIP-NPK")
    check stats.formatOverlap["NEXTER-NIP-NPK"] == 1