mirror of
https://github.com/tensorchord/pgvecto.rs.git
synced 2025-04-18 21:44:00 +03:00
feat: increment build (#508)
Signed-off-by: usamoi <usamoi@outlook.com>
This commit is contained in:
parent
4360cac6f8
commit
64e81961e7
@ -1,4 +1,2 @@
|
||||
[advisories]
|
||||
ignore = [
|
||||
"RUSTSEC-2021-0127", # serde_cbor is unmaintained / serde_cbor is not used
|
||||
]
|
||||
ignore = ["RUSTSEC-2021-0127", "RUSTSEC-2021-0145"]
|
||||
|
8
.github/workflows/rust.yml
vendored
8
.github/workflows/rust.yml
vendored
@ -167,10 +167,10 @@ jobs:
|
||||
- name: Test (x86_64)
|
||||
if: matrix.arch == 'x86_64'
|
||||
run: |
|
||||
ASSETS=$(mktemp -d)
|
||||
wget https://downloadmirror.intel.com/813591/sde-external-9.33.0-2024-01-07-lin.tar.xz -O $ASSETS/sde-external.tar.xz
|
||||
tar -xf $ASSETS/sde-external.tar.xz -C $ASSETS
|
||||
cargo --config "target.x86_64-unknown-linux-gnu.runner = [\"$ASSETS/sde-external-9.33.0-2024-01-07-lin/sde64\", \"-spr\", \"--\"]" test "_v4" --all --no-fail-fast --features "pg$VERSION" --target $ARCH-unknown-linux-gnu -- --nocapture
|
||||
ASSETS=$(mktemp -d)
|
||||
wget https://downloadmirror.intel.com/813591/sde-external-9.33.0-2024-01-07-lin.tar.xz -O $ASSETS/sde-external.tar.xz
|
||||
tar -xf $ASSETS/sde-external.tar.xz -C $ASSETS
|
||||
cargo --config "target.x86_64-unknown-linux-gnu.runner = [\"$ASSETS/sde-external-9.33.0-2024-01-07-lin/sde64\", \"-spr\", \"--\"]" test "_v4" --all --no-fail-fast --features "pg$VERSION" --target $ARCH-unknown-linux-gnu -- --nocapture
|
||||
- name: Post Set up Cache
|
||||
uses: actions/cache/save@v4
|
||||
if: ${{ !steps.cache.outputs.cache-hit }}
|
||||
|
141
Cargo.lock
generated
141
Cargo.lock
generated
@ -220,7 +220,7 @@ dependencies = [
|
||||
"futures-io",
|
||||
"futures-lite 2.3.0",
|
||||
"parking",
|
||||
"polling 3.7.1",
|
||||
"polling 3.7.2",
|
||||
"rustix 0.38.34",
|
||||
"slab",
|
||||
"tracing",
|
||||
@ -275,9 +275,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "async-signal"
|
||||
version = "0.2.7"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "329972aa325176e89114919f2a80fdae4f4c040f66a370b1a1159c6c0f94e7aa"
|
||||
checksum = "794f185324c2f00e771cd9f1ae8b5ac68be2ca7abb129a87afd6e86d228bc54d"
|
||||
dependencies = [
|
||||
"async-io 2.3.3",
|
||||
"async-lock 3.4.0",
|
||||
@ -371,9 +371,9 @@ checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
|
||||
|
||||
[[package]]
|
||||
name = "backtrace"
|
||||
version = "0.3.72"
|
||||
version = "0.3.73"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "17c6a35df3749d2e8bb1b7b21a976d82b15548788d2735b9d82f329268f71a11"
|
||||
checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a"
|
||||
dependencies = [
|
||||
"addr2line",
|
||||
"cc",
|
||||
@ -389,7 +389,6 @@ name = "base"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"base_macros",
|
||||
"bytemuck",
|
||||
"c",
|
||||
"detect",
|
||||
"half 2.4.1",
|
||||
@ -516,26 +515,6 @@ version = "3.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
|
||||
|
||||
[[package]]
|
||||
name = "bytemuck"
|
||||
version = "1.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
|
||||
dependencies = [
|
||||
"bytemuck_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bytemuck_derive"
|
||||
version = "1.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ee891b04274a59bd38b412188e24b849617b2e45a0fd8d057deb63e7403761b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.66",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.5.0"
|
||||
@ -570,9 +549,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.98"
|
||||
version = "1.0.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f"
|
||||
checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695"
|
||||
|
||||
[[package]]
|
||||
name = "cee-scape"
|
||||
@ -634,9 +613,10 @@ checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
|
||||
name = "common"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"base",
|
||||
"log",
|
||||
"memmap2",
|
||||
"rand",
|
||||
"rustix 0.38.34",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@ -871,11 +851,10 @@ name = "elkan_k_means"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"base",
|
||||
"bytemuck",
|
||||
"common",
|
||||
"num-traits",
|
||||
"rand",
|
||||
"rayon 0.0.0",
|
||||
"stoppable_rayon",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1042,7 +1021,7 @@ dependencies = [
|
||||
"base",
|
||||
"common",
|
||||
"quantization",
|
||||
"rayon 0.0.0",
|
||||
"stoppable_rayon",
|
||||
"storage",
|
||||
]
|
||||
|
||||
@ -1190,6 +1169,17 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "graph"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"base",
|
||||
"common",
|
||||
"parking_lot",
|
||||
"rand",
|
||||
"stoppable_rayon",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "0.3.26"
|
||||
@ -1221,7 +1211,6 @@ version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"cfg-if",
|
||||
"crunchy",
|
||||
"num-traits",
|
||||
@ -1270,16 +1259,24 @@ version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc"
|
||||
|
||||
[[package]]
|
||||
name = "hnsw"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"base",
|
||||
"bytemuck",
|
||||
"common",
|
||||
"graph",
|
||||
"num-traits",
|
||||
"parking_lot",
|
||||
"quantization",
|
||||
"rayon 0.0.0",
|
||||
"serde_json",
|
||||
"stoppable_rayon",
|
||||
"storage",
|
||||
]
|
||||
|
||||
@ -1316,9 +1313,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "httparse"
|
||||
version = "1.8.0"
|
||||
version = "1.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904"
|
||||
checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
|
||||
|
||||
[[package]]
|
||||
name = "httpdate"
|
||||
@ -1463,9 +1460,9 @@ dependencies = [
|
||||
"parking_lot",
|
||||
"quantization",
|
||||
"rand",
|
||||
"rayon 0.0.0",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"stoppable_rayon",
|
||||
"storage",
|
||||
"thiserror",
|
||||
"uuid",
|
||||
@ -1562,8 +1559,8 @@ dependencies = [
|
||||
"num-traits",
|
||||
"quantization",
|
||||
"rand",
|
||||
"rayon 0.0.0",
|
||||
"serde_json",
|
||||
"stoppable_rayon",
|
||||
"storage",
|
||||
]
|
||||
|
||||
@ -1699,9 +1696,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.2"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "memfd"
|
||||
@ -1735,9 +1732,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.7.3"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae"
|
||||
checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
|
||||
dependencies = [
|
||||
"adler",
|
||||
]
|
||||
@ -1791,9 +1788,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.35.0"
|
||||
version = "0.36.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b8ec7ab813848ba4522158d5517a6093db1ded27575b070f4177b8d12b41db5e"
|
||||
checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
@ -2030,13 +2027,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "polling"
|
||||
version = "3.7.1"
|
||||
version = "3.7.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e6a007746f34ed64099e88783b0ae369eaa3da6392868ba262e2af9b8fbaea1"
|
||||
checksum = "a3ed00ed3fbf728b5816498ecd316d1716eecaced9c0c8d2c5a6740ca214985b"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"concurrent-queue",
|
||||
"hermit-abi 0.3.9",
|
||||
"hermit-abi 0.4.0",
|
||||
"pin-project-lite",
|
||||
"rustix 0.38.34",
|
||||
"tracing",
|
||||
@ -2098,6 +2095,7 @@ dependencies = [
|
||||
"elkan_k_means",
|
||||
"num-traits",
|
||||
"rand",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
@ -2156,14 +2154,6 @@ dependencies = [
|
||||
"rand",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"log",
|
||||
"rayon 1.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.10.0"
|
||||
@ -2186,9 +2176,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.5.1"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e"
|
||||
checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd"
|
||||
dependencies = [
|
||||
"bitflags 2.5.0",
|
||||
]
|
||||
@ -2206,9 +2196,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.10.4"
|
||||
version = "1.10.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
|
||||
checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
@ -2218,9 +2208,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.6"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
|
||||
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
@ -2229,9 +2219,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.3"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56"
|
||||
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
@ -2604,12 +2594,22 @@ version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
|
||||
|
||||
[[package]]
|
||||
name = "stoppable_rayon"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"log",
|
||||
"rayon",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storage"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"base",
|
||||
"common",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -2950,9 +2950,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.5.0"
|
||||
version = "2.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633"
|
||||
checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"idna",
|
||||
@ -2961,9 +2961,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.1"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
|
||||
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||
|
||||
[[package]]
|
||||
name = "uuid"
|
||||
@ -3018,7 +3018,6 @@ dependencies = [
|
||||
"arrayvec",
|
||||
"base",
|
||||
"bincode",
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
"chrono",
|
||||
"csv",
|
||||
@ -3346,9 +3345,9 @@ checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.6.11"
|
||||
version = "0.6.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56c52728401e1dc672a56e81e593e912aa54c78f40246869f78359a2bf24d29d"
|
||||
checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
13
Cargo.toml
13
Cargo.toml
@ -19,7 +19,6 @@ pg16 = ["pgrx/pg16"]
|
||||
[dependencies]
|
||||
arrayvec.workspace = true
|
||||
bincode.workspace = true
|
||||
bytemuck.workspace = true
|
||||
byteorder.workspace = true
|
||||
chrono = "0.4.38"
|
||||
csv = "1.3.0"
|
||||
@ -70,18 +69,12 @@ edition = "2021"
|
||||
arc-swap = "1.7.0"
|
||||
arrayvec = "0.7.4"
|
||||
bincode = "1.3.3"
|
||||
bytemuck = { version = "1.14.3", features = ["extern_crate_alloc"] }
|
||||
byteorder = "1.5.0"
|
||||
half = { version = "2.4.0", features = [
|
||||
"bytemuck",
|
||||
"num-traits",
|
||||
"rand_distr",
|
||||
"serde",
|
||||
] }
|
||||
half = { version = "2.4.0", features = ["num-traits", "rand_distr", "serde"] }
|
||||
libc = "0.2.153"
|
||||
log = "0.4.21"
|
||||
memmap2 = "0.9.4"
|
||||
num-traits = "0.2.18"
|
||||
num-traits = "0.2.19"
|
||||
parking_lot = "0.12.1"
|
||||
paste = "1.0.14"
|
||||
rand = "0.8.5"
|
||||
@ -94,7 +87,7 @@ uuid = { version = "1.7.0", features = ["serde", "v4"] }
|
||||
validator = { version = "0.18.0", features = ["derive"] }
|
||||
|
||||
[workspace.lints]
|
||||
rust.unsafe_op_in_unsafe_fn = "forbid"
|
||||
rust.unsafe_op_in_unsafe_fn = "deny"
|
||||
rust.unused_lifetimes = "warn"
|
||||
rust.unused_qualifications = "warn"
|
||||
|
||||
|
@ -4,7 +4,6 @@ version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[dependencies]
|
||||
bytemuck.workspace = true
|
||||
half.workspace = true
|
||||
libc.workspace = true
|
||||
num-traits.workspace = true
|
||||
|
@ -106,49 +106,53 @@ pub enum StartError {
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
#[validate(schema(function = "IndexOptions::validate_index_options"))]
|
||||
#[validate(schema(function = "IndexOptions::validate_self"))]
|
||||
pub struct IndexOptions {
|
||||
#[validate(nested)]
|
||||
pub vector: VectorOptions,
|
||||
#[validate(nested)]
|
||||
pub segment: SegmentsOptions,
|
||||
#[validate(nested)]
|
||||
pub indexing: IndexingOptions,
|
||||
}
|
||||
|
||||
impl IndexOptions {
|
||||
fn validate_index_options(options: &IndexOptions) -> Result<(), ValidationError> {
|
||||
if options.vector.v != VectorKind::SVecf32
|
||||
&& options.vector.v != VectorKind::BVecf32
|
||||
&& options.vector.v != VectorKind::Veci8
|
||||
{
|
||||
return Ok(());
|
||||
fn validate_self(&self) -> Result<(), ValidationError> {
|
||||
match (self.vector.v, &self.indexing) {
|
||||
(VectorKind::Vecf32, _) => Ok(()),
|
||||
(VectorKind::Vecf16, _) => Ok(()),
|
||||
(
|
||||
_,
|
||||
IndexingOptions::Flat(FlatIndexingOptions {
|
||||
quantization: QuantizationOptions::Trivial(_),
|
||||
..
|
||||
})
|
||||
| IndexingOptions::Ivf(IvfIndexingOptions {
|
||||
quantization: QuantizationOptions::Trivial(_),
|
||||
..
|
||||
})
|
||||
| IndexingOptions::Hnsw(HnswIndexingOptions {
|
||||
quantization: QuantizationOptions::Trivial(_),
|
||||
..
|
||||
}),
|
||||
) => Ok(()),
|
||||
_ => Err(ValidationError::new("not valid index options")),
|
||||
}
|
||||
let is_trivial = match &options.indexing {
|
||||
IndexingOptions::Flat(x) => matches!(x.quantization, QuantizationOptions::Trivial(_)),
|
||||
IndexingOptions::Ivf(x) => matches!(x.quantization, QuantizationOptions::Trivial(_)),
|
||||
IndexingOptions::Hnsw(x) => matches!(x.quantization, QuantizationOptions::Trivial(_)),
|
||||
};
|
||||
if !is_trivial {
|
||||
return Err(ValidationError::new(
|
||||
"Quantization is not supported for svector, bvector, and veci8.",
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Validate, Alter)]
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate, Alter)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct IndexAlterableOptions {
|
||||
#[serde(default)]
|
||||
#[validate(nested)]
|
||||
pub segment: SegmentOptions,
|
||||
#[serde(default)]
|
||||
#[validate(nested)]
|
||||
pub optimizing: OptimizingOptions,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
#[validate(schema(function = "Self::validate_0"))]
|
||||
#[validate(schema(function = "Self::validate_dims"))]
|
||||
#[validate(schema(function = "Self::validate_self"))]
|
||||
pub struct VectorOptions {
|
||||
#[validate(range(min = 1, max = 1_048_575))]
|
||||
#[serde(rename = "dimensions")]
|
||||
@ -160,58 +164,50 @@ pub struct VectorOptions {
|
||||
}
|
||||
|
||||
impl VectorOptions {
|
||||
// Jaccard distance is only supported for bvector.
|
||||
pub fn validate_0(&self) -> Result<(), ValidationError> {
|
||||
if self.v != VectorKind::BVecf32 && self.d == DistanceKind::Jaccard {
|
||||
return Err(ValidationError::new(
|
||||
"Jaccard distance is only supported for bvector.",
|
||||
));
|
||||
pub fn validate_self(&self) -> Result<(), ValidationError> {
|
||||
match (self.v, self.d, self.dims) {
|
||||
(VectorKind::Vecf32, DistanceKind::L2, 1..65536) => Ok(()),
|
||||
(VectorKind::Vecf32, DistanceKind::Cos, 1..65536) => Ok(()),
|
||||
(VectorKind::Vecf32, DistanceKind::Dot, 1..65536) => Ok(()),
|
||||
(VectorKind::Vecf16, DistanceKind::L2, 1..65536) => Ok(()),
|
||||
(VectorKind::Vecf16, DistanceKind::Cos, 1..65536) => Ok(()),
|
||||
(VectorKind::Vecf16, DistanceKind::Dot, 1..65536) => Ok(()),
|
||||
(VectorKind::SVecf32, DistanceKind::L2, 1..1048576) => Ok(()),
|
||||
(VectorKind::SVecf32, DistanceKind::Cos, 1..1048576) => Ok(()),
|
||||
(VectorKind::SVecf32, DistanceKind::Dot, 1..1048576) => Ok(()),
|
||||
(VectorKind::BVecf32, DistanceKind::L2, 1..65536) => Ok(()),
|
||||
(VectorKind::BVecf32, DistanceKind::Cos, 1..65536) => Ok(()),
|
||||
(VectorKind::BVecf32, DistanceKind::Dot, 1..65536) => Ok(()),
|
||||
(VectorKind::BVecf32, DistanceKind::Jaccard, 1..65536) => Ok(()),
|
||||
(VectorKind::Veci8, DistanceKind::L2, 1..65536) => Ok(()),
|
||||
(VectorKind::Veci8, DistanceKind::Cos, 1..65536) => Ok(()),
|
||||
(VectorKind::Veci8, DistanceKind::Dot, 1..65536) => Ok(()),
|
||||
_ => Err(ValidationError::new("not valid vector options")),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn validate_dims(&self) -> Result<(), ValidationError> {
|
||||
if self.v != VectorKind::SVecf32 && self.dims > 65535 {
|
||||
return Err(ValidationError::new(
|
||||
"Except svector, the maximum number of dimensions is 65535.",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Validate, Alter)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
#[validate(schema(function = "Self::validate_0"))]
|
||||
pub struct SegmentsOptions {
|
||||
#[serde(default = "SegmentsOptions::default_max_growing_segment_size")]
|
||||
pub struct SegmentOptions {
|
||||
#[serde(default = "SegmentOptions::default_max_growing_segment_size")]
|
||||
#[validate(range(min = 1, max = 4_000_000_000u32))]
|
||||
pub max_growing_segment_size: u32,
|
||||
#[serde(default = "SegmentsOptions::default_max_sealed_segment_size")]
|
||||
#[serde(default = "SegmentOptions::default_max_sealed_segment_size")]
|
||||
#[validate(range(min = 1, max = 4_000_000_000u32))]
|
||||
pub max_sealed_segment_size: u32,
|
||||
}
|
||||
|
||||
impl SegmentsOptions {
|
||||
impl SegmentOptions {
|
||||
fn default_max_growing_segment_size() -> u32 {
|
||||
20_000
|
||||
}
|
||||
fn default_max_sealed_segment_size() -> u32 {
|
||||
1_000_000
|
||||
}
|
||||
// max_growing_segment_size <= max_sealed_segment_size
|
||||
fn validate_0(&self) -> Result<(), ValidationError> {
|
||||
if self.max_growing_segment_size > self.max_sealed_segment_size {
|
||||
return Err(ValidationError::new(
|
||||
"`max_growing_segment_size` must be less than or equal to `max_sealed_segment_size`",
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
4_000_000_000u32
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SegmentsOptions {
|
||||
impl Default for SegmentOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_growing_segment_size: Self::default_max_growing_segment_size(),
|
||||
@ -223,30 +219,36 @@ impl Default for SegmentsOptions {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Validate, Alter)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct OptimizingOptions {
|
||||
#[serde(default = "OptimizingOptions::default_optimizing_threads")]
|
||||
#[validate(range(min = 1, max = 65535))]
|
||||
pub optimizing_threads: u16,
|
||||
#[serde(default = "OptimizingOptions::default_sealing_secs")]
|
||||
#[validate(range(min = 1, max = 60))]
|
||||
#[validate(range(min = 1, max = 86400))]
|
||||
pub sealing_secs: u64,
|
||||
#[serde(default = "OptimizingOptions::default_sealing_size")]
|
||||
#[validate(range(min = 1, max = 4_000_000_000u32))]
|
||||
pub sealing_size: u32,
|
||||
#[serde(default = "OptimizingOptions::default_optimizing_secs")]
|
||||
#[validate(range(min = 1, max = 86400))]
|
||||
pub optimizing_secs: u64,
|
||||
#[serde(default = "OptimizingOptions::default_optimizing_threads")]
|
||||
#[validate(range(min = 1, max = 65535))]
|
||||
pub optimizing_threads: u16,
|
||||
#[serde(default = "OptimizingOptions::default_delete_threshold")]
|
||||
#[validate(range(min = 0.01, max = 1.00))]
|
||||
#[validate(range(min = 0.0001, max = 1.0000))]
|
||||
pub delete_threshold: f64,
|
||||
}
|
||||
|
||||
impl OptimizingOptions {
|
||||
fn default_optimizing_threads() -> u16 {
|
||||
1
|
||||
}
|
||||
fn default_sealing_secs() -> u64 {
|
||||
60
|
||||
10
|
||||
}
|
||||
fn default_sealing_size() -> u32 {
|
||||
1
|
||||
}
|
||||
fn default_optimizing_secs() -> u64 {
|
||||
60
|
||||
}
|
||||
fn default_optimizing_threads() -> u16 {
|
||||
1
|
||||
}
|
||||
fn default_delete_threshold() -> f64 {
|
||||
0.2
|
||||
}
|
||||
@ -255,9 +257,10 @@ impl OptimizingOptions {
|
||||
impl Default for OptimizingOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
optimizing_threads: Self::default_optimizing_threads(),
|
||||
sealing_secs: Self::default_sealing_secs(),
|
||||
sealing_size: Self::default_sealing_size(),
|
||||
optimizing_secs: Self::default_optimizing_secs(),
|
||||
optimizing_threads: Self::default_optimizing_threads(),
|
||||
delete_threshold: Self::default_delete_threshold(),
|
||||
}
|
||||
}
|
||||
@ -328,45 +331,24 @@ impl Default for FlatIndexingOptions {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct IvfIndexingOptions {
|
||||
#[serde(default = "IvfIndexingOptions::default_least_iterations")]
|
||||
#[validate(range(min = 1, max = 1_000_000))]
|
||||
pub least_iterations: u32,
|
||||
#[serde(default = "IvfIndexingOptions::default_iterations")]
|
||||
#[validate(range(min = 1, max = 1_000_000))]
|
||||
pub iterations: u32,
|
||||
#[serde(default = "IvfIndexingOptions::default_nlist")]
|
||||
#[validate(range(min = 1, max = 1_000_000))]
|
||||
pub nlist: u32,
|
||||
#[serde(default = "IvfIndexingOptions::default_nsample")]
|
||||
#[validate(range(min = 1, max = 1_000_000))]
|
||||
pub nsample: u32,
|
||||
#[serde(default)]
|
||||
#[validate(nested)]
|
||||
pub quantization: QuantizationOptions,
|
||||
}
|
||||
|
||||
impl IvfIndexingOptions {
|
||||
fn default_least_iterations() -> u32 {
|
||||
16
|
||||
}
|
||||
fn default_iterations() -> u32 {
|
||||
500
|
||||
}
|
||||
fn default_nlist() -> u32 {
|
||||
1000
|
||||
}
|
||||
fn default_nsample() -> u32 {
|
||||
65536
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for IvfIndexingOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
least_iterations: Self::default_least_iterations(),
|
||||
iterations: Self::default_iterations(),
|
||||
nlist: Self::default_nlist(),
|
||||
nsample: Self::default_nsample(),
|
||||
quantization: Default::default(),
|
||||
}
|
||||
}
|
||||
@ -430,6 +412,15 @@ impl Default for QuantizationOptions {
|
||||
}
|
||||
}
|
||||
|
||||
impl QuantizationOptions {
|
||||
pub fn unwrap_product(self) -> ProductQuantizationOptions {
|
||||
let QuantizationOptions::Product(x) = self else {
|
||||
unreachable!()
|
||||
};
|
||||
x
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TrivialQuantizationOptions {}
|
||||
@ -453,23 +444,13 @@ impl Default for ScalarQuantizationOptions {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct ProductQuantizationOptions {
|
||||
#[serde(default = "ProductQuantizationOptions::default_sample")]
|
||||
#[validate(range(min = 1, max = 1_000_000))]
|
||||
pub sample: u32,
|
||||
#[serde(default)]
|
||||
pub ratio: ProductQuantizationOptionsRatio,
|
||||
}
|
||||
|
||||
impl ProductQuantizationOptions {
|
||||
fn default_sample() -> u32 {
|
||||
65535
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ProductQuantizationOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
sample: Self::default_sample(),
|
||||
ratio: Default::default(),
|
||||
}
|
||||
}
|
||||
@ -497,8 +478,10 @@ impl Default for ProductQuantizationOptionsRatio {
|
||||
pub struct SearchOptions {
|
||||
#[validate(range(min = 1, max = 65535))]
|
||||
pub hnsw_ef_search: u32,
|
||||
#[validate(range(min = 1, max = 1_000_000))]
|
||||
#[validate(range(min = 1, max = 65535))]
|
||||
pub ivf_nprobe: u32,
|
||||
#[validate(range(min = 1, max = 65535))]
|
||||
pub diskann_ef_search: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
|
@ -1,5 +1,4 @@
|
||||
#![feature(core_intrinsics)]
|
||||
#![feature(doc_cfg)]
|
||||
#![feature(avx512_target_feature)]
|
||||
#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512))]
|
||||
#![allow(internal_features)]
|
||||
@ -12,6 +11,7 @@
|
||||
pub mod distance;
|
||||
pub mod index;
|
||||
pub mod operator;
|
||||
pub mod pod;
|
||||
pub mod scalar;
|
||||
pub mod search;
|
||||
pub mod vector;
|
||||
|
57
crates/base/src/pod.rs
Normal file
57
crates/base/src/pod.rs
Normal file
@ -0,0 +1,57 @@
|
||||
// This module is a workaround for orphan rules
|
||||
|
||||
use crate::scalar::{F16, F32, I8};
|
||||
|
||||
/// # Safety
|
||||
///
|
||||
/// * No uninitialized bytes.
|
||||
/// * Can be safely zero-initialized.
|
||||
/// * Inhabited.
|
||||
pub unsafe trait Pod: Copy {}
|
||||
|
||||
unsafe impl Pod for u8 {}
|
||||
unsafe impl Pod for u16 {}
|
||||
unsafe impl Pod for u32 {}
|
||||
unsafe impl Pod for u64 {}
|
||||
unsafe impl Pod for u128 {}
|
||||
unsafe impl Pod for usize {}
|
||||
|
||||
unsafe impl Pod for i8 {}
|
||||
unsafe impl Pod for i16 {}
|
||||
unsafe impl Pod for i32 {}
|
||||
unsafe impl Pod for i64 {}
|
||||
unsafe impl Pod for i128 {}
|
||||
unsafe impl Pod for isize {}
|
||||
|
||||
unsafe impl Pod for f32 {}
|
||||
unsafe impl Pod for f64 {}
|
||||
|
||||
unsafe impl Pod for I8 {}
|
||||
unsafe impl Pod for F16 {}
|
||||
unsafe impl Pod for F32 {}
|
||||
|
||||
unsafe impl Pod for (F32, u32) {}
|
||||
|
||||
unsafe impl Pod for crate::search::Payload {}
|
||||
|
||||
pub fn bytes_of<T: Pod>(t: &T) -> &[u8] {
|
||||
unsafe {
|
||||
core::slice::from_raw_parts(
|
||||
std::ptr::addr_of!(*t) as *const u8,
|
||||
std::mem::size_of::<T>(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn zeroed_vec<T: Pod>(length: usize) -> Vec<T> {
|
||||
unsafe {
|
||||
let mut v = Vec::with_capacity(length);
|
||||
std::ptr::write_bytes(v.as_mut_ptr(), 0, length);
|
||||
v.set_len(length);
|
||||
v
|
||||
}
|
||||
}
|
||||
|
||||
pub fn try_pod_read_unaligned<T: Pod>(bytes: &[u8]) -> T {
|
||||
unsafe { (bytes.as_ptr() as *const T).read_unaligned() }
|
||||
}
|
@ -41,6 +41,7 @@ impl Display for F32 {
|
||||
}
|
||||
|
||||
impl PartialEq for F32 {
|
||||
#[inline(always)]
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.0.total_cmp(&other.0) == Ordering::Equal
|
||||
}
|
||||
@ -62,143 +63,171 @@ impl Ord for F32 {
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl bytemuck::Zeroable for F32 {}
|
||||
|
||||
unsafe impl bytemuck::Pod for F32 {}
|
||||
|
||||
impl Zero for F32 {
|
||||
#[inline(always)]
|
||||
fn zero() -> Self {
|
||||
Self(f32::zero())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_zero(&self) -> bool {
|
||||
self.0.is_zero()
|
||||
}
|
||||
}
|
||||
|
||||
impl num_traits::One for F32 {
|
||||
#[inline(always)]
|
||||
fn one() -> Self {
|
||||
Self(f32::one())
|
||||
}
|
||||
}
|
||||
|
||||
impl num_traits::FromPrimitive for F32 {
|
||||
#[inline(always)]
|
||||
fn from_i64(n: i64) -> Option<Self> {
|
||||
f32::from_i64(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_u64(n: u64) -> Option<Self> {
|
||||
f32::from_u64(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_isize(n: isize) -> Option<Self> {
|
||||
f32::from_isize(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_i8(n: i8) -> Option<Self> {
|
||||
f32::from_i8(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_i16(n: i16) -> Option<Self> {
|
||||
f32::from_i16(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_i32(n: i32) -> Option<Self> {
|
||||
f32::from_i32(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_i128(n: i128) -> Option<Self> {
|
||||
f32::from_i128(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_usize(n: usize) -> Option<Self> {
|
||||
f32::from_usize(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_u8(n: u8) -> Option<Self> {
|
||||
f32::from_u8(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_u16(n: u16) -> Option<Self> {
|
||||
f32::from_u16(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_u32(n: u32) -> Option<Self> {
|
||||
f32::from_u32(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_u128(n: u128) -> Option<Self> {
|
||||
f32::from_u128(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_f32(n: f32) -> Option<Self> {
|
||||
f32::from_f32(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_f64(n: f64) -> Option<Self> {
|
||||
f32::from_f64(n).map(Self)
|
||||
}
|
||||
}
|
||||
|
||||
/// Conversion of `F32` into primitive numbers.
///
/// Each method hands the wrapped `f32` to its `num_traits::ToPrimitive`
/// implementation and returns that result unchanged.
impl num_traits::ToPrimitive for F32 {
    // ---- signed integers ----

    #[inline(always)]
    fn to_i8(&self) -> Option<i8> {
        num_traits::ToPrimitive::to_i8(&self.0)
    }

    #[inline(always)]
    fn to_i16(&self) -> Option<i16> {
        num_traits::ToPrimitive::to_i16(&self.0)
    }

    #[inline(always)]
    fn to_i32(&self) -> Option<i32> {
        num_traits::ToPrimitive::to_i32(&self.0)
    }

    #[inline(always)]
    fn to_i64(&self) -> Option<i64> {
        num_traits::ToPrimitive::to_i64(&self.0)
    }

    #[inline(always)]
    fn to_i128(&self) -> Option<i128> {
        num_traits::ToPrimitive::to_i128(&self.0)
    }

    #[inline(always)]
    fn to_isize(&self) -> Option<isize> {
        num_traits::ToPrimitive::to_isize(&self.0)
    }

    // ---- unsigned integers ----

    #[inline(always)]
    fn to_u8(&self) -> Option<u8> {
        num_traits::ToPrimitive::to_u8(&self.0)
    }

    #[inline(always)]
    fn to_u16(&self) -> Option<u16> {
        num_traits::ToPrimitive::to_u16(&self.0)
    }

    #[inline(always)]
    fn to_u32(&self) -> Option<u32> {
        num_traits::ToPrimitive::to_u32(&self.0)
    }

    #[inline(always)]
    fn to_u64(&self) -> Option<u64> {
        num_traits::ToPrimitive::to_u64(&self.0)
    }

    #[inline(always)]
    fn to_u128(&self) -> Option<u128> {
        num_traits::ToPrimitive::to_u128(&self.0)
    }

    #[inline(always)]
    fn to_usize(&self) -> Option<usize> {
        num_traits::ToPrimitive::to_usize(&self.0)
    }

    // ---- floating point ----

    #[inline(always)]
    fn to_f32(&self) -> Option<f32> {
        num_traits::ToPrimitive::to_f32(&self.0)
    }

    #[inline(always)]
    fn to_f64(&self) -> Option<f64> {
        num_traits::ToPrimitive::to_f64(&self.0)
    }
}
|
||||
|
||||
impl num_traits::NumCast for F32 {
|
||||
#[inline(always)]
|
||||
fn from<T: num_traits::ToPrimitive>(n: T) -> Option<Self> {
|
||||
num_traits::NumCast::from(n).map(Self)
|
||||
}
|
||||
@ -213,239 +242,298 @@ impl num_traits::Num for F32 {
|
||||
}
|
||||
|
||||
impl num_traits::Float for F32 {
|
||||
#[inline(always)]
|
||||
fn nan() -> Self {
|
||||
Self(f32::nan())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn infinity() -> Self {
|
||||
Self(f32::infinity())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn neg_infinity() -> Self {
|
||||
Self(f32::neg_infinity())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn neg_zero() -> Self {
|
||||
Self(f32::neg_zero())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn min_value() -> Self {
|
||||
Self(f32::min_value())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn min_positive_value() -> Self {
|
||||
Self(f32::min_positive_value())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn max_value() -> Self {
|
||||
Self(f32::max_value())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_nan(self) -> bool {
|
||||
self.0.is_nan()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_infinite(self) -> bool {
|
||||
self.0.is_infinite()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_finite(self) -> bool {
|
||||
self.0.is_finite()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_normal(self) -> bool {
|
||||
self.0.is_normal()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn classify(self) -> std::num::FpCategory {
|
||||
self.0.classify()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn floor(self) -> Self {
|
||||
Self(self.0.floor())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn ceil(self) -> Self {
|
||||
Self(self.0.ceil())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn round(self) -> Self {
|
||||
Self(self.0.round())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn trunc(self) -> Self {
|
||||
Self(self.0.trunc())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn fract(self) -> Self {
|
||||
Self(self.0.fract())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn abs(self) -> Self {
|
||||
Self(self.0.abs())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn signum(self) -> Self {
|
||||
Self(self.0.signum())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_sign_positive(self) -> bool {
|
||||
self.0.is_sign_positive()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_sign_negative(self) -> bool {
|
||||
self.0.is_sign_negative()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn mul_add(self, a: Self, b: Self) -> Self {
|
||||
Self(self.0.mul_add(a.0, b.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn recip(self) -> Self {
|
||||
Self(self.0.recip())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn powi(self, n: i32) -> Self {
|
||||
Self(self.0.powi(n))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn powf(self, n: Self) -> Self {
|
||||
Self(self.0.powf(n.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn sqrt(self) -> Self {
|
||||
Self(self.0.sqrt())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn exp(self) -> Self {
|
||||
Self(self.0.exp())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn exp2(self) -> Self {
|
||||
Self(self.0.exp2())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn ln(self) -> Self {
|
||||
Self(self.0.ln())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn log(self, base: Self) -> Self {
|
||||
Self(self.0.log(base.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn log2(self) -> Self {
|
||||
Self(self.0.log2())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn log10(self) -> Self {
|
||||
Self(self.0.log10())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn max(self, other: Self) -> Self {
|
||||
Self(self.0.max(other.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn min(self, other: Self) -> Self {
|
||||
Self(self.0.min(other.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn abs_sub(self, _: Self) -> Self {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn cbrt(self) -> Self {
|
||||
Self(self.0.cbrt())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn hypot(self, other: Self) -> Self {
|
||||
Self(self.0.hypot(other.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn sin(self) -> Self {
|
||||
Self(self.0.sin())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn cos(self) -> Self {
|
||||
Self(self.0.cos())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn tan(self) -> Self {
|
||||
Self(self.0.tan())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn asin(self) -> Self {
|
||||
Self(self.0.asin())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn acos(self) -> Self {
|
||||
Self(self.0.acos())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn atan(self) -> Self {
|
||||
Self(self.0.atan())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn atan2(self, other: Self) -> Self {
|
||||
Self(self.0.atan2(other.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn sin_cos(self) -> (Self, Self) {
|
||||
let (_x, _y) = self.0.sin_cos();
|
||||
(Self(_x), Self(_y))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn exp_m1(self) -> Self {
|
||||
Self(self.0.exp_m1())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn ln_1p(self) -> Self {
|
||||
Self(self.0.ln_1p())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn sinh(self) -> Self {
|
||||
Self(self.0.sinh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn cosh(self) -> Self {
|
||||
Self(self.0.cosh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn tanh(self) -> Self {
|
||||
Self(self.0.tanh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn asinh(self) -> Self {
|
||||
Self(self.0.asinh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn acosh(self) -> Self {
|
||||
Self(self.0.acosh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn atanh(self) -> Self {
|
||||
Self(self.0.atanh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn integer_decode(self) -> (u64, i16, i8) {
|
||||
self.0.integer_decode()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn epsilon() -> Self {
|
||||
Self(f32::EPSILON)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_subnormal(self) -> bool {
|
||||
self.0.classify() == std::num::FpCategory::Subnormal
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_degrees(self) -> Self {
|
||||
Self(self.0.to_degrees())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_radians(self) -> Self {
|
||||
Self(self.0.to_radians())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn copysign(self, sign: Self) -> Self {
|
||||
Self(self.0.copysign(sign.0))
|
||||
}
|
||||
@ -540,6 +628,7 @@ impl RemAssign<F32> for F32 {
|
||||
impl Neg for F32 {
|
||||
type Output = Self;
|
||||
|
||||
#[inline(always)]
|
||||
fn neg(self) -> Self::Output {
|
||||
Self(self.0.neg())
|
||||
}
|
||||
@ -554,12 +643,14 @@ impl FromStr for F32 {
|
||||
}
|
||||
|
||||
impl From<f32> for F32 {
|
||||
#[inline(always)]
|
||||
fn from(value: f32) -> Self {
|
||||
Self(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<F32> for f32 {
|
||||
#[inline(always)]
|
||||
fn from(F32(float): F32) -> Self {
|
||||
float
|
||||
}
|
||||
@ -575,6 +666,7 @@ impl Add<f32> for F32 {
|
||||
}
|
||||
|
||||
impl AddAssign<f32> for F32 {
|
||||
#[inline(always)]
|
||||
fn add_assign(&mut self, rhs: f32) {
|
||||
self.0 = intrinsics::fadd_algebraic(self.0, rhs)
|
||||
}
|
||||
@ -668,18 +760,22 @@ mod intrinsics {
|
||||
}
|
||||
|
||||
impl ScalarLike for F32 {
|
||||
#[inline(always)]
|
||||
fn from_f32(x: f32) -> Self {
|
||||
Self(x)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_f32(self) -> f32 {
|
||||
self.0
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_f(x: F32) -> Self {
|
||||
Self::from_f32(x.0)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_f(self) -> F32 {
|
||||
F32(Self::to_f32(self))
|
||||
}
|
||||
|
@ -28,6 +28,7 @@ impl Display for F16 {
|
||||
}
|
||||
|
||||
impl PartialEq for F16 {
|
||||
#[inline(always)]
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.0.total_cmp(&other.0) == Ordering::Equal
|
||||
}
|
||||
@ -49,143 +50,171 @@ impl Ord for F16 {
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl bytemuck::Zeroable for F16 {}
|
||||
|
||||
unsafe impl bytemuck::Pod for F16 {}
|
||||
|
||||
impl Zero for F16 {
|
||||
#[inline(always)]
|
||||
fn zero() -> Self {
|
||||
Self(f16::zero())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_zero(&self) -> bool {
|
||||
self.0.is_zero()
|
||||
}
|
||||
}
|
||||
|
||||
impl num_traits::One for F16 {
|
||||
#[inline(always)]
|
||||
fn one() -> Self {
|
||||
Self(f16::one())
|
||||
}
|
||||
}
|
||||
|
||||
impl num_traits::FromPrimitive for F16 {
|
||||
#[inline(always)]
|
||||
fn from_i64(n: i64) -> Option<Self> {
|
||||
f16::from_i64(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_u64(n: u64) -> Option<Self> {
|
||||
f16::from_u64(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_isize(n: isize) -> Option<Self> {
|
||||
f16::from_isize(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_i8(n: i8) -> Option<Self> {
|
||||
f16::from_i8(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_i16(n: i16) -> Option<Self> {
|
||||
f16::from_i16(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_i32(n: i32) -> Option<Self> {
|
||||
f16::from_i32(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_i128(n: i128) -> Option<Self> {
|
||||
f16::from_i128(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_usize(n: usize) -> Option<Self> {
|
||||
f16::from_usize(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_u8(n: u8) -> Option<Self> {
|
||||
f16::from_u8(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_u16(n: u16) -> Option<Self> {
|
||||
f16::from_u16(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_u32(n: u32) -> Option<Self> {
|
||||
f16::from_u32(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_u128(n: u128) -> Option<Self> {
|
||||
f16::from_u128(n).map(Self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_f32(n: f32) -> Option<Self> {
|
||||
Some(Self(f16::from_f32(n)))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_f64(n: f64) -> Option<Self> {
|
||||
Some(Self(f16::from_f64(n)))
|
||||
}
|
||||
}
|
||||
|
||||
impl num_traits::ToPrimitive for F16 {
|
||||
#[inline(always)]
|
||||
fn to_i64(&self) -> Option<i64> {
|
||||
self.0.to_i64()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_u64(&self) -> Option<u64> {
|
||||
self.0.to_u64()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_isize(&self) -> Option<isize> {
|
||||
self.0.to_isize()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_i8(&self) -> Option<i8> {
|
||||
self.0.to_i8()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_i16(&self) -> Option<i16> {
|
||||
self.0.to_i16()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_i32(&self) -> Option<i32> {
|
||||
self.0.to_i32()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_i128(&self) -> Option<i128> {
|
||||
self.0.to_i128()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_usize(&self) -> Option<usize> {
|
||||
self.0.to_usize()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_u8(&self) -> Option<u8> {
|
||||
self.0.to_u8()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_u16(&self) -> Option<u16> {
|
||||
self.0.to_u16()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_u32(&self) -> Option<u32> {
|
||||
self.0.to_u32()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_u128(&self) -> Option<u128> {
|
||||
self.0.to_u128()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_f32(&self) -> Option<f32> {
|
||||
Some(self.0.to_f32())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_f64(&self) -> Option<f64> {
|
||||
Some(self.0.to_f64())
|
||||
}
|
||||
}
|
||||
|
||||
impl num_traits::NumCast for F16 {
|
||||
#[inline(always)]
|
||||
fn from<T: num_traits::ToPrimitive>(n: T) -> Option<Self> {
|
||||
num_traits::NumCast::from(n).map(Self)
|
||||
}
|
||||
@ -200,239 +229,297 @@ impl num_traits::Num for F16 {
|
||||
}
|
||||
|
||||
impl num_traits::Float for F16 {
|
||||
#[inline(always)]
|
||||
fn nan() -> Self {
|
||||
Self(f16::nan())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn infinity() -> Self {
|
||||
Self(f16::infinity())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn neg_infinity() -> Self {
|
||||
Self(f16::neg_infinity())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn neg_zero() -> Self {
|
||||
Self(f16::neg_zero())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn min_value() -> Self {
|
||||
Self(f16::min_value())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn min_positive_value() -> Self {
|
||||
Self(f16::min_positive_value())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn max_value() -> Self {
|
||||
Self(f16::max_value())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_nan(self) -> bool {
|
||||
self.0.is_nan()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_infinite(self) -> bool {
|
||||
self.0.is_infinite()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_finite(self) -> bool {
|
||||
self.0.is_finite()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_normal(self) -> bool {
|
||||
self.0.is_normal()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn classify(self) -> std::num::FpCategory {
|
||||
self.0.classify()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn floor(self) -> Self {
|
||||
Self(self.0.floor())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn ceil(self) -> Self {
|
||||
Self(self.0.ceil())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn round(self) -> Self {
|
||||
Self(self.0.round())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn trunc(self) -> Self {
|
||||
Self(self.0.trunc())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn fract(self) -> Self {
|
||||
Self(self.0.fract())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn abs(self) -> Self {
|
||||
Self(self.0.abs())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn signum(self) -> Self {
|
||||
Self(self.0.signum())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_sign_positive(self) -> bool {
|
||||
self.0.is_sign_positive()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_sign_negative(self) -> bool {
|
||||
self.0.is_sign_negative()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn mul_add(self, a: Self, b: Self) -> Self {
|
||||
Self(self.0.mul_add(a.0, b.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn recip(self) -> Self {
|
||||
Self(self.0.recip())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn powi(self, n: i32) -> Self {
|
||||
Self(self.0.powi(n))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn powf(self, n: Self) -> Self {
|
||||
Self(self.0.powf(n.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn sqrt(self) -> Self {
|
||||
Self(self.0.sqrt())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn exp(self) -> Self {
|
||||
Self(self.0.exp())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn exp2(self) -> Self {
|
||||
Self(self.0.exp2())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn ln(self) -> Self {
|
||||
Self(self.0.ln())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn log(self, base: Self) -> Self {
|
||||
Self(self.0.log(base.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn log2(self) -> Self {
|
||||
Self(self.0.log2())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn log10(self) -> Self {
|
||||
Self(self.0.log10())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn max(self, other: Self) -> Self {
|
||||
Self(self.0.max(other.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn min(self, other: Self) -> Self {
|
||||
Self(self.0.min(other.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn abs_sub(self, _: Self) -> Self {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn cbrt(self) -> Self {
|
||||
Self(self.0.cbrt())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn hypot(self, other: Self) -> Self {
|
||||
Self(self.0.hypot(other.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn sin(self) -> Self {
|
||||
Self(self.0.sin())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn cos(self) -> Self {
|
||||
Self(self.0.cos())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn tan(self) -> Self {
|
||||
Self(self.0.tan())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn asin(self) -> Self {
|
||||
Self(self.0.asin())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn acos(self) -> Self {
|
||||
Self(self.0.acos())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn atan(self) -> Self {
|
||||
Self(self.0.atan())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn atan2(self, other: Self) -> Self {
|
||||
Self(self.0.atan2(other.0))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn sin_cos(self) -> (Self, Self) {
|
||||
let (_x, _y) = self.0.sin_cos();
|
||||
(Self(_x), Self(_y))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn exp_m1(self) -> Self {
|
||||
Self(self.0.exp_m1())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn ln_1p(self) -> Self {
|
||||
Self(self.0.ln_1p())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn sinh(self) -> Self {
|
||||
Self(self.0.sinh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn cosh(self) -> Self {
|
||||
Self(self.0.cosh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn tanh(self) -> Self {
|
||||
Self(self.0.tanh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn asinh(self) -> Self {
|
||||
Self(self.0.asinh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn acosh(self) -> Self {
|
||||
Self(self.0.acosh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn atanh(self) -> Self {
|
||||
Self(self.0.atanh())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn integer_decode(self) -> (u64, i16, i8) {
|
||||
self.0.integer_decode()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn epsilon() -> Self {
|
||||
Self(f16::EPSILON)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_subnormal(self) -> bool {
|
||||
self.0.classify() == std::num::FpCategory::Subnormal
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_degrees(self) -> Self {
|
||||
Self(self.0.to_degrees())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_radians(self) -> Self {
|
||||
Self(self.0.to_radians())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn copysign(self, sign: Self) -> Self {
|
||||
Self(self.0.copysign(sign.0))
|
||||
}
|
||||
@ -527,6 +614,7 @@ impl RemAssign<F16> for F16 {
|
||||
impl Neg for F16 {
|
||||
type Output = Self;
|
||||
|
||||
#[inline(always)]
|
||||
fn neg(self) -> Self::Output {
|
||||
Self(self.0.neg())
|
||||
}
|
||||
@ -541,12 +629,14 @@ impl FromStr for F16 {
|
||||
}
|
||||
|
||||
impl From<f16> for F16 {
|
||||
#[inline(always)]
|
||||
fn from(value: f16) -> Self {
|
||||
Self(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<F16> for f16 {
|
||||
#[inline(always)]
|
||||
fn from(F16(float): F16) -> Self {
|
||||
float
|
||||
}
|
||||
@ -562,6 +652,7 @@ impl Add<f16> for F16 {
|
||||
}
|
||||
|
||||
impl AddAssign<f16> for F16 {
|
||||
#[inline(always)]
|
||||
fn add_assign(&mut self, rhs: f16) {
|
||||
self.0 = intrinsics::fadd_algebraic(self.0, rhs)
|
||||
}
|
||||
@ -657,16 +748,22 @@ mod intrinsics {
|
||||
}
|
||||
|
||||
impl ScalarLike for F16 {
|
||||
#[inline(always)]
|
||||
fn from_f32(x: f32) -> Self {
|
||||
Self(f16::from_f32(x))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_f32(self) -> f32 {
|
||||
f16::to_f32(self.0)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn from_f(x: F32) -> Self {
|
||||
Self::from_f32(x.0)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_f(self) -> F32 {
|
||||
F32(Self::to_f32(self))
|
||||
}
|
||||
|
@ -43,10 +43,6 @@ impl Ord for I8 {
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl bytemuck::Zeroable for I8 {}
|
||||
|
||||
unsafe impl bytemuck::Pod for I8 {}
|
||||
|
||||
impl From<i8> for I8 {
|
||||
fn from(value: i8) -> Self {
|
||||
Self(value)
|
||||
|
@ -15,12 +15,11 @@ pub trait ScalarLike:
|
||||
+ serde::Serialize
|
||||
+ for<'a> serde::Deserialize<'a>
|
||||
+ Ord
|
||||
+ bytemuck::Zeroable
|
||||
+ bytemuck::Pod
|
||||
+ num_traits::Float
|
||||
+ num_traits::Zero
|
||||
+ num_traits::NumOps
|
||||
+ num_traits::NumAssignOps
|
||||
+ crate::pod::Pod
|
||||
{
|
||||
fn from_f32(x: f32) -> Self;
|
||||
fn to_f32(self) -> f32;
|
||||
|
@ -1,6 +1,7 @@
|
||||
use crate::operator::{Borrowed, Operator};
|
||||
use crate::scalar::F32;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::any::Any;
|
||||
use std::fmt::Display;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||
@ -65,22 +66,24 @@ impl Payload {
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl bytemuck::Zeroable for Payload {}
|
||||
unsafe impl bytemuck::Pod for Payload {}
|
||||
|
||||
/// A pair of a distance and a payload.
///
/// Because `Ord` is derived, elements compare by `distance` first and by
/// `payload` only to break ties (field declaration order).
// NOTE(review): presumably one search hit (distance to the query plus the
// identifier of the matched row) — confirm against the callers.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||
pub struct Element {
|
||||
/// Distance value stored for this element.
pub distance: F32,
|
||||
/// Payload associated with this element.
pub payload: Payload,
|
||||
}
|
||||
|
||||
pub trait Collection<O: Operator> {
|
||||
/// Read-only, indexable access to vectors for operator `O`.
///
/// Implementors must be `Send + Sync`.
pub trait Vectors<O: Operator>: Send + Sync {
|
||||
/// Returns the dimension count as a `u32`.
// NOTE(review): presumably shared by every vector in the set — confirm.
fn dims(&self) -> u32;
|
||||
/// Returns the number of vectors as a `u32`.
fn len(&self) -> u32;
|
||||
/// Borrows the vector at index `i`.
// NOTE(review): presumably requires `i < self.len()` — confirm the
// out-of-range behaviour in the implementors.
fn vector(&self, i: u32) -> Borrowed<'_, O>;
|
||||
}
|
||||
|
||||
/// A `Vectors` source that can also return a `Payload` for each index.
pub trait Collection<O: Operator>: Vectors<O> {
|
||||
/// Returns the payload stored for index `i`.
// NOTE(review): presumably `i` addresses the same entry as
// `Vectors::vector(i)` — confirm.
fn payload(&self, i: u32) -> Payload;
|
||||
}
|
||||
|
||||
/// A `Collection` that additionally exposes a type-erased "main" object and
/// an existence check per index.
// NOTE(review): the names suggest this supports building on top of an
// existing main index (this commit is "increment build") — confirm.
pub trait Source<O: Operator>: Collection<O> {
|
||||
// ..
|
||||
/// Returns a reference to the "main" object if one is available as type `T`.
// NOTE(review): presumably `None` when there is no main object or it is
// not a `T` — confirm in the implementors.
fn get_main<T: Any>(&self) -> Option<&T>;
|
||||
/// Returns a length associated with the "main" object, as a `u32`.
fn get_main_len(&self) -> u32;
|
||||
/// Reports whether index `i` passes the implementor's existence check.
fn check_existing(&self, i: u32) -> bool;
|
||||
}
|
||||
|
@ -196,8 +196,7 @@ impl<'a> PartialOrd for BVecf32Borrowed<'a> {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[detect::target_cpu(enable = "v4_avx512vpopcntdq")]
|
||||
unsafe fn cosine_v4_avx512vpopcntdq(lhs: BVecf32Borrowed<'_>, rhs: BVecf32Borrowed<'_>) -> F32 {
|
||||
use std::arch::x86_64::*;
|
||||
@ -278,8 +277,7 @@ pub fn cosine(lhs: BVecf32Borrowed<'_>, rhs: BVecf32Borrowed<'_>) -> F32 {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[detect::target_cpu(enable = "v4_avx512vpopcntdq")]
|
||||
unsafe fn dot_v4_avx512vpopcntdq(lhs: BVecf32Borrowed<'_>, rhs: BVecf32Borrowed<'_>) -> F32 {
|
||||
use std::arch::x86_64::*;
|
||||
@ -345,8 +343,7 @@ pub fn dot(lhs: BVecf32Borrowed<'_>, rhs: BVecf32Borrowed<'_>) -> F32 {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[detect::target_cpu(enable = "v4_avx512vpopcntdq")]
|
||||
unsafe fn sl2_v4_avx512vpopcntdq(lhs: BVecf32Borrowed<'_>, rhs: BVecf32Borrowed<'_>) -> F32 {
|
||||
use std::arch::x86_64::*;
|
||||
@ -412,8 +409,7 @@ pub fn sl2(lhs: BVecf32Borrowed<'_>, rhs: BVecf32Borrowed<'_>) -> F32 {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[detect::target_cpu(enable = "v4_avx512vpopcntdq")]
|
||||
unsafe fn jaccard_v4_avx512vpopcntdq(lhs: BVecf32Borrowed<'_>, rhs: BVecf32Borrowed<'_>) -> F32 {
|
||||
use std::arch::x86_64::*;
|
||||
@ -485,8 +481,7 @@ pub fn jaccard(lhs: BVecf32Borrowed<'_>, rhs: BVecf32Borrowed<'_>) -> F32 {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[detect::target_cpu(enable = "v4_avx512vpopcntdq")]
|
||||
unsafe fn length_v4_avx512vpopcntdq(vector: BVecf32Borrowed<'_>) -> F32 {
|
||||
use std::arch::x86_64::*;
|
||||
|
@ -207,8 +207,7 @@ impl<'a> VectorBorrowed for SVecf32Borrowed<'a> {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[detect::target_cpu(enable = "v4")]
|
||||
unsafe fn cosine_v4(lhs: SVecf32Borrowed<'_>, rhs: SVecf32Borrowed<'_>) -> F32 {
|
||||
use std::arch::x86_64::*;
|
||||
@ -375,8 +374,7 @@ pub fn cosine(lhs: SVecf32Borrowed<'_>, rhs: SVecf32Borrowed<'_>) -> F32 {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[detect::target_cpu(enable = "v4")]
|
||||
unsafe fn dot_v4(lhs: SVecf32Borrowed<'_>, rhs: SVecf32Borrowed<'_>) -> F32 {
|
||||
use std::arch::x86_64::*;
|
||||
@ -509,8 +507,7 @@ pub fn dot_2(lhs: SVecf32Borrowed<'_>, rhs: &[F32]) -> F32 {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[detect::target_cpu(enable = "v4")]
|
||||
unsafe fn sl2_v4(lhs: SVecf32Borrowed<'_>, rhs: SVecf32Borrowed<'_>) -> F32 {
|
||||
use std::arch::x86_64::*;
|
||||
|
@ -127,8 +127,7 @@ impl<'a> VectorBorrowed for Vecf16Borrowed<'a> {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
unsafe fn cosine_v4_avx512fp16(lhs: &[F16], rhs: &[F16]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
@ -159,7 +158,6 @@ fn cosine_v4_avx512fp16_test() {
|
||||
|
||||
#[inline]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
unsafe fn cosine_v4(lhs: &[F16], rhs: &[F16]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
@ -190,7 +188,6 @@ fn cosine_v4_test() {
|
||||
|
||||
#[inline]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
unsafe fn cosine_v3(lhs: &[F16], rhs: &[F16]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
@ -235,8 +232,7 @@ pub fn cosine(lhs: &[F16], rhs: &[F16]) -> F32 {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
unsafe fn dot_v4_avx512fp16(lhs: &[F16], rhs: &[F16]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
@ -267,7 +263,6 @@ fn dot_v4_avx512fp16_test() {
|
||||
|
||||
#[inline]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
unsafe fn dot_v4(lhs: &[F16], rhs: &[F16]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
@ -298,7 +293,6 @@ fn dot_v4_test() {
|
||||
|
||||
#[inline]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
unsafe fn dot_v3(lhs: &[F16], rhs: &[F16]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
@ -339,8 +333,7 @@ pub fn dot(lhs: &[F16], rhs: &[F16]) -> F32 {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
unsafe fn sl2_v4_avx512fp16(lhs: &[F16], rhs: &[F16]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
@ -371,7 +364,6 @@ fn sl2_v4_avx512fp16_test() {
|
||||
|
||||
#[inline]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
unsafe fn sl2_v4(lhs: &[F16], rhs: &[F16]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
@ -402,7 +394,6 @@ fn sl2_v4_test() {
|
||||
|
||||
#[inline]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
unsafe fn sl2_v3(lhs: &[F16], rhs: &[F16]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
@ -476,41 +467,3 @@ pub fn xy_x2_y2(lhs: &[F16], rhs: &[F16]) -> (F32, F32, F32) {
|
||||
}
|
||||
(xy, x2, y2)
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
pub fn xy_x2_y2_delta(lhs: &[F16], rhs: &[F16], del: &[F16]) -> (F32, F32, F32) {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
let mut xy = F32::zero();
|
||||
let mut x2 = F32::zero();
|
||||
let mut y2 = F32::zero();
|
||||
for i in 0..n {
|
||||
xy += lhs[i].to_f() * (rhs[i].to_f() + del[i].to_f());
|
||||
x2 += lhs[i].to_f() * lhs[i].to_f();
|
||||
y2 += (rhs[i].to_f() + del[i].to_f()) * (rhs[i].to_f() + del[i].to_f());
|
||||
}
|
||||
(xy, x2, y2)
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
pub fn dot_delta(lhs: &[F16], rhs: &[F16], del: &[F16]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n: usize = lhs.len();
|
||||
let mut xy = F32::zero();
|
||||
for i in 0..n {
|
||||
xy += lhs[i].to_f() * (rhs[i].to_f() + del[i].to_f());
|
||||
}
|
||||
xy
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
pub fn distance_squared_l2_delta(lhs: &[F16], rhs: &[F16], del: &[F16]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
let mut d2 = F32::zero();
|
||||
for i in 0..n {
|
||||
let d = lhs[i].to_f() - (rhs[i].to_f() + del[i].to_f());
|
||||
d2 += d * d;
|
||||
}
|
||||
d2
|
||||
}
|
||||
|
@ -197,41 +197,3 @@ pub fn xy_x2_y2(lhs: &[F32], rhs: &[F32]) -> (F32, F32, F32) {
|
||||
}
|
||||
(xy, x2, y2)
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
pub fn xy_x2_y2_delta(lhs: &[F32], rhs: &[F32], del: &[F32]) -> (F32, F32, F32) {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
let mut xy = F32::zero();
|
||||
let mut x2 = F32::zero();
|
||||
let mut y2 = F32::zero();
|
||||
for i in 0..n {
|
||||
xy += lhs[i] * (rhs[i] + del[i]);
|
||||
x2 += lhs[i] * lhs[i];
|
||||
y2 += (rhs[i] + del[i]) * (rhs[i] + del[i]);
|
||||
}
|
||||
(xy, x2, y2)
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
pub fn dot_delta(lhs: &[F32], rhs: &[F32], del: &[F32]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n: usize = lhs.len();
|
||||
let mut xy = F32::zero();
|
||||
for i in 0..n {
|
||||
xy += lhs[i] * (rhs[i] + del[i]);
|
||||
}
|
||||
xy
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
pub fn distance_squared_l2_delta(lhs: &[F32], rhs: &[F32], del: &[F32]) -> F32 {
|
||||
assert!(lhs.len() == rhs.len());
|
||||
let n = lhs.len();
|
||||
let mut d2 = F32::zero();
|
||||
for i in 0..n {
|
||||
let d = lhs[i] - (rhs[i] + del[i]);
|
||||
d2 += d * d;
|
||||
}
|
||||
d2
|
||||
}
|
||||
|
@ -319,8 +319,7 @@ pub fn i8_precompute(data: &[I8], alpha: F32, offset: F32) -> (F32, F32) {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[cfg(any(target_arch = "x86_64", doc))]
|
||||
#[doc(cfg(target_arch = "x86_64"))]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[detect::target_cpu(enable = "v4_avx512vnni")]
|
||||
unsafe fn dot_internal_v4_avx512vnni(x: &[I8], y: &[I8]) -> F32 {
|
||||
use std::arch::x86_64::*;
|
||||
|
@ -4,9 +4,11 @@ version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[dependencies]
|
||||
bytemuck.workspace = true
|
||||
base = { path = "../base" }
|
||||
|
||||
log.workspace = true
|
||||
memmap2.workspace = true
|
||||
rand.workspace = true
|
||||
rustix.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
33
crates/common/src/json.rs
Normal file
33
crates/common/src/json.rs
Normal file
@ -0,0 +1,33 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{ops::Deref, path::Path};
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Json<T>(pub T);
|
||||
|
||||
impl<T: Serialize> Json<T> {
|
||||
pub fn create(path: impl AsRef<Path>, x: T) -> Self {
|
||||
std::fs::write(path, serde_json::to_string(&x).unwrap()).unwrap();
|
||||
Self(x)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: for<'a> Deserialize<'a>> Json<T> {
|
||||
pub fn open(path: impl AsRef<Path>) -> Self {
|
||||
let x = serde_json::from_slice(&std::fs::read(path).unwrap()).unwrap();
|
||||
Self(x)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> AsRef<T> for Json<T> {
|
||||
fn as_ref(&self) -> &T {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Deref for Json<T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &T {
|
||||
&self.0
|
||||
}
|
||||
}
|
@ -1,5 +1,11 @@
|
||||
pub mod clean;
|
||||
pub mod dir_ops;
|
||||
pub mod file_atomic;
|
||||
pub mod json;
|
||||
pub mod mmap_array;
|
||||
pub mod rand;
|
||||
pub mod remap;
|
||||
pub mod sample;
|
||||
pub mod variants;
|
||||
pub mod vec2;
|
||||
pub mod vec3;
|
||||
|
@ -1,4 +1,4 @@
|
||||
use bytemuck::{Pod, Zeroable};
|
||||
use base::pod::Pod;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Read, Seek, Write};
|
||||
use std::ops::Index;
|
||||
@ -11,11 +11,14 @@ pub struct MmapArray<T> {
|
||||
_mmap: memmap2::Mmap,
|
||||
}
|
||||
|
||||
unsafe impl<T: Send + Sync> Send for MmapArray<T> {}
|
||||
unsafe impl<T: Send + Sync> Sync for MmapArray<T> {}
|
||||
|
||||
impl<T> MmapArray<T>
|
||||
where
|
||||
T: Pod,
|
||||
{
|
||||
pub fn create<I>(path: &Path, iter: I) -> Self
|
||||
pub fn create<I>(path: impl AsRef<Path>, iter: I) -> Self
|
||||
where
|
||||
I: Iterator<Item = T>,
|
||||
{
|
||||
@ -28,11 +31,11 @@ where
|
||||
let mut info = Information { len: 0 };
|
||||
let mut buffered = BufWriter::new(&file);
|
||||
for data in iter {
|
||||
buffered.write_all(bytemuck::bytes_of(&data)).unwrap();
|
||||
buffered.write_all(base::pod::bytes_of(&data)).unwrap();
|
||||
info.len += 1;
|
||||
}
|
||||
buffered.write_all(&[0u8; 4096]).unwrap();
|
||||
buffered.write_all(bytemuck::bytes_of(&info)).unwrap();
|
||||
buffered.write_all(base::pod::bytes_of(&info)).unwrap();
|
||||
buffered.flush().unwrap();
|
||||
file.sync_all().unwrap();
|
||||
let mmap = unsafe { read_mmap(&file, info.len * std::mem::size_of::<T>()) };
|
||||
@ -43,7 +46,7 @@ where
|
||||
_mmap: mmap,
|
||||
}
|
||||
}
|
||||
pub fn open(path: &Path) -> Self {
|
||||
pub fn open(path: impl AsRef<Path>) -> Self {
|
||||
let file = std::fs::OpenOptions::new().read(true).open(path).unwrap();
|
||||
let info = read_information(&file);
|
||||
let mmap = unsafe { read_mmap(&file, info.len * std::mem::size_of::<T>()) };
|
||||
@ -100,7 +103,6 @@ struct Information {
|
||||
len: usize,
|
||||
}
|
||||
|
||||
unsafe impl Zeroable for Information {}
|
||||
unsafe impl Pod for Information {}
|
||||
|
||||
fn read_information(mut file: &File) -> Information {
|
||||
@ -108,7 +110,7 @@ fn read_information(mut file: &File) -> Information {
|
||||
file.seek(std::io::SeekFrom::End(-(size as i64))).unwrap();
|
||||
let mut buff = vec![0u8; size];
|
||||
file.read_exact(&mut buff).unwrap();
|
||||
bytemuck::try_pod_read_unaligned::<Information>(&buff).unwrap()
|
||||
base::pod::try_pod_read_unaligned::<Information>(&buff)
|
||||
}
|
||||
|
||||
unsafe fn read_mmap(file: &File, len: usize) -> memmap2::Mmap {
|
||||
|
11
crates/common/src/rand.rs
Normal file
11
crates/common/src/rand.rs
Normal file
@ -0,0 +1,11 @@
|
||||
use rand::Rng;
|
||||
|
||||
pub fn sample_u32<R>(rng: &mut R, length: u32, amount: u32) -> Vec<u32>
|
||||
where
|
||||
R: Rng + ?Sized,
|
||||
{
|
||||
match rand::seq::index::sample(rng, length as usize, amount as usize) {
|
||||
rand::seq::index::IndexVec::U32(x) => x,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
101
crates/common/src/remap.rs
Normal file
101
crates/common/src/remap.rs
Normal file
@ -0,0 +1,101 @@
|
||||
use base::operator::*;
|
||||
use base::search::*;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
/// Builds the table that maps each slot of the rebuilt collection to an index
/// in the source.
///
/// * Slots `0..m` start as the "main" points; a slot keeps its own index when
///   `check_existing` reports it alive, otherwise it becomes a hole.
/// * Every alive index in `m..n` is placed into the lowest remaining hole, or
///   appended to the table once no holes are left.
/// * Holes still unfilled at the end (more deletions than insertions) are
///   removed with `swap_remove`, highest slot first, so the element moved in
///   from the tail is never itself a hole.
///
/// `check_existing` is called once per index, in ascending order: first
/// `0..m`, then `m..n`.
pub fn remap(
    /* source.len() */ n: u32,
    /* main.len() */ m: u32,
    check_existing: impl Fn(u32) -> bool,
) -> Vec<u32> {
    let mut table = vec![0u32; m as usize];
    // Deleted main slots, in ascending order.
    let mut holes: Vec<u32> = Vec::new();
    for slot in 0..m {
        if check_existing(slot) {
            table[slot as usize] = slot;
        } else {
            holes.push(slot);
        }
    }
    // Hand out holes lowest-first to the newly inserted points.
    let mut next_hole = 0usize;
    for id in m..n {
        if !check_existing(id) {
            continue;
        }
        match holes.get(next_hole) {
            Some(&slot) => {
                table[slot as usize] = id;
                next_hole += 1;
            }
            None => table.push(id),
        }
    }
    // Leftover holes can only exist if nothing was appended; drop them from
    // the highest slot downwards.
    for &slot in holes[next_hole..].iter().rev() {
        table.swap_remove(slot as usize);
    }
    table
}
|
||||
|
||||
pub struct RemappedCollection<'a, O: Operator, C: Collection<O>> {
|
||||
collection: &'a C,
|
||||
remap: Vec<u32>,
|
||||
barrier: u32,
|
||||
_phantom: PhantomData<fn(O) -> O>,
|
||||
}
|
||||
|
||||
impl<'a, O: Operator, S: Source<O>> RemappedCollection<'a, O, S> {
|
||||
pub fn from_source(source: &'a S) -> Self {
|
||||
let barrier = source.get_main_len();
|
||||
let remap = remap(source.len(), barrier, |i| source.check_existing(i));
|
||||
Self {
|
||||
collection: source,
|
||||
remap,
|
||||
barrier,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, O: Operator, C: Collection<O>> RemappedCollection<'a, O, C> {
|
||||
pub fn from_collection(collection: &'a C, remap: Vec<u32>) -> Self {
|
||||
assert_eq!(remap.len(), collection.len() as usize);
|
||||
let barrier = collection.len();
|
||||
Self {
|
||||
collection,
|
||||
remap,
|
||||
barrier,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, O: Operator, C: Collection<O>> RemappedCollection<'a, O, C> {
|
||||
#[inline(always)]
|
||||
pub fn skip(&self, x: u32) -> bool {
|
||||
x < self.barrier && (x as usize) < self.remap.len() && self.remap[x as usize] == x
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn barrier(&self) -> u32 {
|
||||
self.barrier
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Operator, C: Collection<O>> Vectors<O> for RemappedCollection<'_, O, C> {
|
||||
fn dims(&self) -> u32 {
|
||||
self.collection.dims()
|
||||
}
|
||||
|
||||
fn len(&self) -> u32 {
|
||||
self.remap.len() as u32
|
||||
}
|
||||
|
||||
fn vector(&self, i: u32) -> Borrowed<'_, O> {
|
||||
self.collection.vector(self.remap[i as usize])
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Operator, C: Collection<O>> Collection<O> for RemappedCollection<'_, O, C> {
|
||||
fn payload(&self, i: u32) -> Payload {
|
||||
self.collection.payload(self.remap[i as usize])
|
||||
}
|
||||
}
|
51
crates/common/src/sample.rs
Normal file
51
crates/common/src/sample.rs
Normal file
@ -0,0 +1,51 @@
|
||||
use crate::vec2::Vec2;
|
||||
use base::operator::{Operator, Scalar};
|
||||
use base::search::Vectors;
|
||||
use base::vector::VectorBorrowed;
|
||||
|
||||
const SAMPLES: usize = 65536;
|
||||
|
||||
pub fn sample<O: Operator>(vectors: &impl Vectors<O>) -> Vec2<Scalar<O>> {
|
||||
let n = vectors.len();
|
||||
let m = std::cmp::min(SAMPLES as u32, n);
|
||||
let f = super::rand::sample_u32(&mut rand::thread_rng(), n, m);
|
||||
let mut samples = Vec2::new(vectors.dims(), m as usize);
|
||||
for i in 0..m {
|
||||
let v = vectors.vector(f[i as usize] as u32).to_vec();
|
||||
samples[i as usize].copy_from_slice(&v);
|
||||
}
|
||||
samples
|
||||
}
|
||||
|
||||
pub fn sample_subvector<O: Operator>(
|
||||
vectors: &impl Vectors<O>,
|
||||
s: usize,
|
||||
e: usize,
|
||||
) -> Vec2<Scalar<O>> {
|
||||
let n = vectors.len();
|
||||
let m = std::cmp::min(SAMPLES as u32, n);
|
||||
let f = super::rand::sample_u32(&mut rand::thread_rng(), n, m);
|
||||
let mut samples = Vec2::new((e - s) as u32, m as usize);
|
||||
for i in 0..m {
|
||||
let v = vectors.vector(f[i as usize] as u32).to_vec();
|
||||
samples[i as usize].copy_from_slice(&v[s..e]);
|
||||
}
|
||||
samples
|
||||
}
|
||||
|
||||
pub fn sample_subvector_transform<O: Operator>(
|
||||
vectors: &impl Vectors<O>,
|
||||
s: usize,
|
||||
e: usize,
|
||||
transform: impl Fn(&mut [Scalar<O>]) -> &[Scalar<O>],
|
||||
) -> Vec2<Scalar<O>> {
|
||||
let n = vectors.len();
|
||||
let m = std::cmp::min(SAMPLES as u32, n);
|
||||
let f = super::rand::sample_u32(&mut rand::thread_rng(), n, m);
|
||||
let mut samples = Vec2::new((e - s) as u32, m as usize);
|
||||
for i in 0..m {
|
||||
let mut v = vectors.vector(f[i as usize] as u32).to_vec();
|
||||
samples[i as usize].copy_from_slice(transform(&mut v));
|
||||
}
|
||||
samples
|
||||
}
|
23
crates/common/src/variants.rs
Normal file
23
crates/common/src/variants.rs
Normal file
@ -0,0 +1,23 @@
|
||||
use std::path::Path;
|
||||
|
||||
/// Determines which one of the candidate names in `variants` exists as an
/// entry of the directory `path`, and returns that name.
///
/// Every (directory entry, candidate) pair with equal names counts as one
/// match, so exactly one such pair must exist.
///
/// # Panics
/// * `"failed to read dir"` / `"failed to walk dir"` if the directory cannot
///   be listed.
/// * `"too many matches"` if more than one match is found.
/// * `"no matches"` if none is found.
pub fn variants<const N: usize>(path: impl AsRef<Path>, variants: [&str; N]) -> &str {
    let entries = std::fs::read_dir(path)
        .expect("failed to read dir")
        .collect::<Result<Vec<_>, _>>()
        .expect("failed to walk dir");
    let mut matches = Vec::new();
    for entry in &entries {
        let file_name = entry.file_name();
        matches.extend(
            variants
                .iter()
                .copied()
                .filter(|candidate| file_name == **candidate),
        );
    }
    match matches.as_slice() {
        [] => panic!("no matches"),
        [only] => *only,
        _ => panic!("too many matches"),
    }
}
|
@ -1,17 +1,18 @@
|
||||
use bytemuck::Zeroable;
|
||||
use base::pod::Pod;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::ops::{Deref, DerefMut, Index, IndexMut};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Vec2<T> {
|
||||
dims: u32,
|
||||
v: Vec<T>,
|
||||
}
|
||||
|
||||
impl<T: Zeroable + Ord> Vec2<T> {
|
||||
impl<T: Pod + Ord> Vec2<T> {
|
||||
pub fn new(dims: u32, n: usize) -> Self {
|
||||
Self {
|
||||
dims,
|
||||
v: bytemuck::zeroed_vec(dims as usize * n),
|
||||
v: base::pod::zeroed_vec(dims as usize * n),
|
||||
}
|
||||
}
|
||||
pub fn dims(&self) -> u32 {
|
||||
|
101
crates/common/src/vec3.rs
Normal file
101
crates/common/src/vec3.rs
Normal file
@ -0,0 +1,101 @@
|
||||
use base::pod::Pod;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::ops::{Deref, DerefMut, Index, IndexMut};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Vec3<T> {
|
||||
x: usize,
|
||||
y: usize,
|
||||
z: usize,
|
||||
v: Vec<T>,
|
||||
}
|
||||
|
||||
impl<T: Pod + Ord> Vec3<T> {
|
||||
pub fn new(x: usize, y: usize, z: usize) -> Self {
|
||||
Self {
|
||||
x,
|
||||
y,
|
||||
z,
|
||||
v: base::pod::zeroed_vec(x * y * z),
|
||||
}
|
||||
}
|
||||
pub fn x(&self) -> usize {
|
||||
self.x
|
||||
}
|
||||
pub fn y(&self) -> usize {
|
||||
self.y
|
||||
}
|
||||
pub fn z(&self) -> usize {
|
||||
self.z
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Index<()> for Vec3<T> {
|
||||
type Output = [T];
|
||||
|
||||
fn index(&self, (): ()) -> &Self::Output {
|
||||
&self.v[..]
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> IndexMut<()> for Vec3<T> {
|
||||
fn index_mut(&mut self, (): ()) -> &mut Self::Output {
|
||||
&mut self.v[..]
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Index<(usize,)> for Vec3<T> {
|
||||
type Output = [T];
|
||||
|
||||
fn index(&self, (x,): (usize,)) -> &Self::Output {
|
||||
&self.v[x * self.y * self.z..][..self.y * self.z]
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> IndexMut<(usize,)> for Vec3<T> {
|
||||
fn index_mut(&mut self, (x,): (usize,)) -> &mut Self::Output {
|
||||
&mut self.v[x * self.y * self.z..][..self.y * self.z]
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Index<(usize, usize)> for Vec3<T> {
|
||||
type Output = [T];
|
||||
|
||||
fn index(&self, (x, y): (usize, usize)) -> &Self::Output {
|
||||
&self.v[x * self.y * self.z + y * self.z..][..self.z]
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> IndexMut<(usize, usize)> for Vec3<T> {
|
||||
fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut Self::Output {
|
||||
&mut self.v[x * self.y * self.z + y * self.z..][..self.z]
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Index<(usize, usize, usize)> for Vec3<T> {
|
||||
type Output = T;
|
||||
|
||||
fn index(&self, (x, y, z): (usize, usize, usize)) -> &Self::Output {
|
||||
&self.v[x * self.y * self.z + y * self.z + z]
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> IndexMut<(usize, usize, usize)> for Vec3<T> {
|
||||
fn index_mut(&mut self, (x, y, z): (usize, usize, usize)) -> &mut Self::Output {
|
||||
&mut self.v[x * self.y * self.z + y * self.z + z]
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Deref for Vec3<T> {
|
||||
type Target = [T];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.v.deref()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DerefMut for Vec3<T> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.v.deref_mut()
|
||||
}
|
||||
}
|
@ -1,13 +1,10 @@
|
||||
/// Function multiversioning attribute macros for `pgvecto.rs`.
|
||||
///
|
||||
/// ```no_run
|
||||
/// #![feature(doc_cfg)]
|
||||
///
|
||||
/// #[cfg(any(target_arch = "x86_64", doc))]
|
||||
/// #[doc(cfg(target_arch = "x86_64"))]
|
||||
/// #[cfg(target_arch = "x86_64")]
|
||||
/// #[detect::target_cpu(enable = "v3")]
|
||||
/// unsafe fn g_v3(x: &[u32]) -> u32 {
|
||||
/// todo!()
|
||||
/// unimplemented!()
|
||||
/// }
|
||||
///
|
||||
/// #[cfg(all(target_arch = "x86_64", test))]
|
||||
|
@ -187,8 +187,7 @@ pub fn multiversion(
|
||||
let target_arch = list.target_arch;
|
||||
let target_features = list.target_features;
|
||||
port = quote::quote! {
|
||||
#[cfg(any(target_arch = #target_arch, doc))]
|
||||
#[doc(cfg(target_arch = #target_arch))]
|
||||
#[cfg(any(target_arch = #target_arch))]
|
||||
#[target_feature(enable = #target_features)]
|
||||
unsafe fn #name < #generics_params > (#inputs) #output #generics_where { #block }
|
||||
};
|
||||
|
@ -4,13 +4,12 @@ version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[dependencies]
|
||||
bytemuck.workspace = true
|
||||
num-traits.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
base = { path = "../base" }
|
||||
common = { path = "../common" }
|
||||
rayon = { path = "../rayon" }
|
||||
stoppable_rayon = { path = "../stoppable_rayon" }
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
@ -5,18 +5,82 @@ pub mod operator;
|
||||
use crate::operator::OperatorElkanKMeans;
|
||||
use base::operator::*;
|
||||
use base::scalar::*;
|
||||
use base::vector::VectorBorrowed;
|
||||
use common::vec2::Vec2;
|
||||
use num_traits::{Float, Zero};
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};
|
||||
use rayon::slice::ParallelSliceMut;
|
||||
use std::ops::{Index, IndexMut};
|
||||
use stoppable_rayon as rayon;
|
||||
|
||||
pub struct ElkanKMeans<O: Operator> {
|
||||
const ITERATIONS: usize = 100;
|
||||
|
||||
pub fn elkan_k_means<O: OperatorElkanKMeans>(
|
||||
c: usize,
|
||||
mut samples: Vec2<Scalar<O>>,
|
||||
) -> Vec2<Scalar<O>> {
|
||||
assert!(c > 0);
|
||||
for i in 0..samples.len() {
|
||||
O::elkan_k_means_normalize(&mut samples[i]);
|
||||
}
|
||||
let mut elkan_k_means = ElkanKMeans::<O>::new(c, samples);
|
||||
for _ in 0..ITERATIONS {
|
||||
rayon::check();
|
||||
if elkan_k_means.iterate() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
elkan_k_means.finish()
|
||||
}
|
||||
|
||||
pub fn elkan_k_means_lookup<O: OperatorElkanKMeans>(
|
||||
vector: Borrowed<O>,
|
||||
centroids: &Vec2<Scalar<O>>,
|
||||
) -> usize {
|
||||
assert!(!centroids.is_empty());
|
||||
let mut vector = vector.to_vec();
|
||||
O::elkan_k_means_normalize(&mut vector);
|
||||
let mut result = (F32::infinity(), 0);
|
||||
for i in 0..centroids.len() {
|
||||
let dis = O::elkan_k_means_distance(&vector, ¢roids[i]);
|
||||
result = std::cmp::min(result, (dis, i));
|
||||
}
|
||||
result.1
|
||||
}
|
||||
|
||||
pub fn elkan_k_means_lookup_dense<O: OperatorElkanKMeans>(
|
||||
mut vector: Vec<Scalar<O>>,
|
||||
centroids: &Vec2<Scalar<O>>,
|
||||
) -> usize {
|
||||
assert!(!centroids.is_empty());
|
||||
O::elkan_k_means_normalize(&mut vector);
|
||||
let mut result = (F32::infinity(), 0);
|
||||
for i in 0..centroids.len() {
|
||||
let dis = O::elkan_k_means_distance(&vector, ¢roids[i]);
|
||||
result = std::cmp::min(result, (dis, i));
|
||||
}
|
||||
result.1
|
||||
}
|
||||
|
||||
pub fn elkan_k_means_caluate<O: OperatorElkanKMeans>(
|
||||
vector: Borrowed<O>,
|
||||
centroids: &Vec2<Scalar<O>>,
|
||||
) -> Vec<(F32, usize)> {
|
||||
assert!(!centroids.is_empty());
|
||||
let mut vector = vector.to_vec();
|
||||
O::elkan_k_means_normalize(&mut vector);
|
||||
let mut seq = Vec::new();
|
||||
for i in 0..centroids.len() {
|
||||
let dis = O::elkan_k_means_distance(&vector, ¢roids[i]);
|
||||
seq.push((dis, i));
|
||||
}
|
||||
seq
|
||||
}
|
||||
|
||||
struct ElkanKMeans<O: Operator> {
|
||||
dims: u32,
|
||||
c: usize,
|
||||
pub centroids: Vec2<Scalar<O>>,
|
||||
centroids: Vec2<Scalar<O>>,
|
||||
lowerbound: Square,
|
||||
upperbound: Vec<F32>,
|
||||
assign: Vec<usize>,
|
||||
@ -27,7 +91,7 @@ pub struct ElkanKMeans<O: Operator> {
|
||||
const DELTA: f32 = 1.0 / 1024.0;
|
||||
|
||||
impl<O: OperatorElkanKMeans> ElkanKMeans<O> {
|
||||
pub fn new(c: usize, samples: Vec2<Scalar<O>>) -> Self {
|
||||
fn new(c: usize, samples: Vec2<Scalar<O>>) -> Self {
|
||||
let n = samples.len();
|
||||
let dims = samples.dims();
|
||||
|
||||
@ -43,9 +107,9 @@ impl<O: OperatorElkanKMeans> ElkanKMeans<O> {
|
||||
let mut dis = vec![F32::zero(); n];
|
||||
for i in 0..c {
|
||||
let mut sum = F32::zero();
|
||||
dis.par_iter_mut().enumerate().for_each(|(j, x)| {
|
||||
*x = O::elkan_k_means_distance(&samples[j], &centroids[i]);
|
||||
});
|
||||
for j in 0..n {
|
||||
dis[j] = O::elkan_k_means_distance(&samples[j], &centroids[i]);
|
||||
}
|
||||
for j in 0..n {
|
||||
lowerbound[(j, i)] = dis[j];
|
||||
if dis[j] * dis[j] < weight[j] {
|
||||
@ -95,37 +159,7 @@ impl<O: OperatorElkanKMeans> ElkanKMeans<O> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Quick approach if we have little data
|
||||
fn quick_centroids(&mut self) -> bool {
|
||||
let c = self.c;
|
||||
let samples = &self.samples;
|
||||
let rand = &mut self.rand;
|
||||
let centroids = &mut self.centroids;
|
||||
let n = samples.len();
|
||||
let dims = samples.dims();
|
||||
let sorted_index = samples.argsort();
|
||||
for i in 0..n {
|
||||
let index = sorted_index.get(i).unwrap();
|
||||
let last = sorted_index.get(std::cmp::max(i, 1) - 1).unwrap();
|
||||
if *index == 0 || samples[*last] != samples[*index] {
|
||||
centroids[i].copy_from_slice(&samples[*index]);
|
||||
} else {
|
||||
let rand_centroids: Vec<_> = (0..dims)
|
||||
.map(|_| Scalar::<O>::from_f32(rand.gen_range(0.0..1.0f32)))
|
||||
.collect();
|
||||
centroids[i].copy_from_slice(rand_centroids.as_slice());
|
||||
}
|
||||
}
|
||||
for i in n..c {
|
||||
let rand_centroids: Vec<_> = (0..dims)
|
||||
.map(|_| Scalar::<O>::from_f32(rand.gen_range(0.0..1.0f32)))
|
||||
.collect();
|
||||
centroids[i].copy_from_slice(rand_centroids.as_slice());
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
pub fn iterate(&mut self) -> bool {
|
||||
fn iterate(&mut self) -> bool {
|
||||
let c = self.c;
|
||||
let dims = self.dims;
|
||||
let samples = &self.samples;
|
||||
@ -137,22 +171,40 @@ impl<O: OperatorElkanKMeans> ElkanKMeans<O> {
|
||||
let mut change = 0;
|
||||
let n = samples.len();
|
||||
if n <= c {
|
||||
return self.quick_centroids();
|
||||
let c = self.c;
|
||||
let samples = &self.samples;
|
||||
let rand = &mut self.rand;
|
||||
let centroids = &mut self.centroids;
|
||||
let n = samples.len();
|
||||
let dims = samples.dims();
|
||||
let sorted_index = samples.argsort();
|
||||
for i in 0..n {
|
||||
let index = sorted_index.get(i).unwrap();
|
||||
let last = sorted_index.get(std::cmp::max(i, 1) - 1).unwrap();
|
||||
if *index == 0 || samples[*last] != samples[*index] {
|
||||
centroids[i].copy_from_slice(&samples[*index]);
|
||||
} else {
|
||||
let rand_centroids: Vec<_> = (0..dims)
|
||||
.map(|_| Scalar::<O>::from_f32(rand.gen_range(0.0..1.0f32)))
|
||||
.collect();
|
||||
centroids[i].copy_from_slice(rand_centroids.as_slice());
|
||||
}
|
||||
}
|
||||
for i in n..c {
|
||||
let rand_centroids: Vec<_> = (0..dims)
|
||||
.map(|_| Scalar::<O>::from_f32(rand.gen_range(0.0..1.0f32)))
|
||||
.collect();
|
||||
centroids[i].copy_from_slice(rand_centroids.as_slice());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Step 1
|
||||
let mut dist0 = Square::new(c, c);
|
||||
let mut sp = vec![F32::zero(); c];
|
||||
dist0.v.par_iter_mut().enumerate().for_each(|(ii, v)| {
|
||||
let i = ii / c;
|
||||
let j = ii % c;
|
||||
if i <= j {
|
||||
*v = O::elkan_k_means_distance(&centroids[i], &centroids[j]) * 0.5;
|
||||
}
|
||||
});
|
||||
for i in 1..c {
|
||||
for j in 0..i - 1 {
|
||||
dist0[(i, j)] = dist0[(j, i)];
|
||||
for i in 0..c {
|
||||
for j in 0..c {
|
||||
dist0[(i, j)] = O::elkan_k_means_distance(&centroids[i], &centroids[j]) * 0.5;
|
||||
}
|
||||
}
|
||||
for i in 0..c {
|
||||
@ -168,13 +220,12 @@ impl<O: OperatorElkanKMeans> ElkanKMeans<O> {
|
||||
}
|
||||
sp[i] = minimal;
|
||||
}
|
||||
|
||||
let mut dis = vec![F32::zero(); n];
|
||||
dis.par_iter_mut().enumerate().for_each(|(i, x)| {
|
||||
for i in 0..n {
|
||||
if upperbound[i] > sp[assign[i]] {
|
||||
*x = O::elkan_k_means_distance(&samples[i], &centroids[assign[i]]);
|
||||
dis[i] = O::elkan_k_means_distance(&samples[i], &centroids[assign[i]]);
|
||||
}
|
||||
});
|
||||
}
|
||||
for i in 0..n {
|
||||
// Step 2
|
||||
if upperbound[i] <= sp[assign[i]] {
|
||||
@ -251,15 +302,15 @@ impl<O: OperatorElkanKMeans> ElkanKMeans<O> {
|
||||
count[i] = count[o] / 2.0;
|
||||
count[o] = count[o] - count[i];
|
||||
}
|
||||
centroids.par_chunks_mut(dims as usize).for_each(|v| {
|
||||
O::elkan_k_means_normalize(v);
|
||||
});
|
||||
for i in 0..c {
|
||||
O::elkan_k_means_normalize(&mut centroids[i]);
|
||||
}
|
||||
|
||||
// Step 5, 6
|
||||
let mut dist1 = vec![F32::zero(); c];
|
||||
dist1.par_iter_mut().enumerate().for_each(|(i, v)| {
|
||||
*v = O::elkan_k_means_distance(&old[i], &centroids[i]);
|
||||
});
|
||||
for i in 0..c {
|
||||
dist1[i] = O::elkan_k_means_distance(&old[i], &centroids[i]);
|
||||
}
|
||||
for i in 0..n {
|
||||
for j in 0..c {
|
||||
self.lowerbound[(i, j)] =
|
||||
@ -273,12 +324,12 @@ impl<O: OperatorElkanKMeans> ElkanKMeans<O> {
|
||||
change == 0
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Vec2<Scalar<O>> {
|
||||
fn finish(self) -> Vec2<Scalar<O>> {
|
||||
self.centroids
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Square {
|
||||
struct Square {
|
||||
x: usize,
|
||||
y: usize,
|
||||
v: Vec<F32>,
|
||||
@ -289,7 +340,7 @@ impl Square {
|
||||
Self {
|
||||
x,
|
||||
y,
|
||||
v: bytemuck::zeroed_vec(x * y),
|
||||
v: base::pod::zeroed_vec(x * y),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -4,15 +4,11 @@ use base::vector::*;
|
||||
use num_traits::Float;
|
||||
|
||||
pub trait OperatorElkanKMeans: Operator {
|
||||
type VectorNormalized: VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]);
|
||||
fn elkan_k_means_distance(lhs: &[Scalar<Self>], rhs: &[Scalar<Self>]) -> F32;
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for BVecf32Cos {
|
||||
type VectorNormalized = Vecf32Owned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]) {
|
||||
vecf32::l2_normalize(vector)
|
||||
}
|
||||
@ -23,8 +19,6 @@ impl OperatorElkanKMeans for BVecf32Cos {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for BVecf32Dot {
|
||||
type VectorNormalized = Vecf32Owned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]) {
|
||||
vecf32::l2_normalize(vector)
|
||||
}
|
||||
@ -35,8 +29,6 @@ impl OperatorElkanKMeans for BVecf32Dot {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for BVecf32Jaccard {
|
||||
type VectorNormalized = Vecf32Owned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]) {
|
||||
vecf32::l2_normalize(vector)
|
||||
}
|
||||
@ -47,8 +39,6 @@ impl OperatorElkanKMeans for BVecf32Jaccard {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for BVecf32L2 {
|
||||
type VectorNormalized = Vecf32Owned;
|
||||
|
||||
fn elkan_k_means_normalize(_: &mut [Scalar<Self>]) {}
|
||||
|
||||
fn elkan_k_means_distance(lhs: &[Scalar<Self>], rhs: &[Scalar<Self>]) -> F32 {
|
||||
@ -57,8 +47,6 @@ impl OperatorElkanKMeans for BVecf32L2 {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for SVecf32Cos {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]) {
|
||||
vecf32::l2_normalize(vector)
|
||||
}
|
||||
@ -69,8 +57,6 @@ impl OperatorElkanKMeans for SVecf32Cos {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for SVecf32Dot {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]) {
|
||||
vecf32::l2_normalize(vector)
|
||||
}
|
||||
@ -81,8 +67,6 @@ impl OperatorElkanKMeans for SVecf32Dot {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for SVecf32L2 {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(_: &mut [Scalar<Self>]) {}
|
||||
|
||||
fn elkan_k_means_distance(lhs: &[Scalar<Self>], rhs: &[Scalar<Self>]) -> F32 {
|
||||
@ -91,19 +75,16 @@ impl OperatorElkanKMeans for SVecf32L2 {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for Vecf16Cos {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]) {
|
||||
vecf16::l2_normalize(vector)
|
||||
}
|
||||
|
||||
fn elkan_k_means_distance(lhs: &[Scalar<Self>], rhs: &[Scalar<Self>]) -> F32 {
|
||||
vecf16::dot(lhs, rhs).acos()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for Vecf16Dot {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]) {
|
||||
vecf16::l2_normalize(vector)
|
||||
}
|
||||
@ -114,8 +95,6 @@ impl OperatorElkanKMeans for Vecf16Dot {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for Vecf16L2 {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(_: &mut [Scalar<Self>]) {}
|
||||
|
||||
fn elkan_k_means_distance(lhs: &[Scalar<Self>], rhs: &[Scalar<Self>]) -> F32 {
|
||||
@ -124,8 +103,6 @@ impl OperatorElkanKMeans for Vecf16L2 {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for Vecf32Cos {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]) {
|
||||
vecf32::l2_normalize(vector)
|
||||
}
|
||||
@ -136,8 +113,6 @@ impl OperatorElkanKMeans for Vecf32Cos {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for Vecf32Dot {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]) {
|
||||
vecf32::l2_normalize(vector)
|
||||
}
|
||||
@ -148,8 +123,6 @@ impl OperatorElkanKMeans for Vecf32Dot {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for Vecf32L2 {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(_: &mut [Scalar<Self>]) {}
|
||||
|
||||
fn elkan_k_means_distance(lhs: &[Scalar<Self>], rhs: &[Scalar<Self>]) -> F32 {
|
||||
@ -158,8 +131,6 @@ impl OperatorElkanKMeans for Vecf32L2 {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for Veci8Cos {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]) {
|
||||
vecf32::l2_normalize(vector)
|
||||
}
|
||||
@ -170,8 +141,6 @@ impl OperatorElkanKMeans for Veci8Cos {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for Veci8Dot {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(vector: &mut [Scalar<Self>]) {
|
||||
vecf32::l2_normalize(vector)
|
||||
}
|
||||
@ -182,8 +151,6 @@ impl OperatorElkanKMeans for Veci8Dot {
|
||||
}
|
||||
|
||||
impl OperatorElkanKMeans for Veci8L2 {
|
||||
type VectorNormalized = Self::VectorOwned;
|
||||
|
||||
fn elkan_k_means_normalize(_: &mut [Scalar<Self>]) {}
|
||||
|
||||
fn elkan_k_means_distance(lhs: &[Scalar<Self>], rhs: &[Scalar<Self>]) -> F32 {
|
||||
|
@ -7,7 +7,7 @@ edition.workspace = true
|
||||
base = { path = "../base" }
|
||||
common = { path = "../common" }
|
||||
quantization = { path = "../quantization" }
|
||||
rayon = { path = "../rayon" }
|
||||
stoppable_rayon = { path = "../stoppable_rayon" }
|
||||
storage = { path = "../storage" }
|
||||
|
||||
[lints]
|
||||
|
@ -1,151 +1,114 @@
|
||||
#![feature(trait_alias)]
|
||||
#![allow(clippy::len_without_is_empty)]
|
||||
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
use base::search::*;
|
||||
use common::dir_ops::sync_dir;
|
||||
use common::mmap_array::MmapArray;
|
||||
use common::remap::RemappedCollection;
|
||||
use quantization::operator::OperatorQuantization;
|
||||
use quantization::Quantization;
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fs::create_dir;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use storage::operator::OperatorStorage;
|
||||
use storage::StorageCollection;
|
||||
use storage::OperatorStorage;
|
||||
use storage::Storage;
|
||||
|
||||
pub trait OperatorFlat = Operator + OperatorQuantization + OperatorStorage;
|
||||
pub trait OperatorFlat: Operator + OperatorQuantization + OperatorStorage {}
|
||||
|
||||
impl<T: Operator + OperatorQuantization + OperatorStorage> OperatorFlat for T {}
|
||||
|
||||
pub struct Flat<O: OperatorFlat> {
|
||||
mmap: FlatMmap<O>,
|
||||
storage: O::Storage,
|
||||
quantization: Quantization<O>,
|
||||
payloads: MmapArray<Payload>,
|
||||
}
|
||||
|
||||
impl<O: OperatorFlat> Flat<O> {
|
||||
pub fn create<S: Source<O>>(path: &Path, options: IndexOptions, source: &S) -> Self {
|
||||
create_dir(path).unwrap();
|
||||
let ram = make(path, options, source);
|
||||
let mmap = save(path, ram);
|
||||
sync_dir(path);
|
||||
Self { mmap }
|
||||
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
|
||||
let remapped = RemappedCollection::from_source(source);
|
||||
from_nothing(path, options, &remapped)
|
||||
}
|
||||
|
||||
pub fn open(path: &Path, options: IndexOptions) -> Self {
|
||||
let mmap = open(path, options);
|
||||
Self { mmap }
|
||||
pub fn open(path: impl AsRef<Path>) -> Self {
|
||||
open(path)
|
||||
}
|
||||
|
||||
pub fn basic(
|
||||
&self,
|
||||
vector: Borrowed<'_, O>,
|
||||
_opts: &SearchOptions,
|
||||
_: &SearchOptions,
|
||||
) -> BinaryHeap<Reverse<Element>> {
|
||||
basic(&self.mmap, vector)
|
||||
let mut result = BinaryHeap::new();
|
||||
for i in 0..self.storage.len() {
|
||||
let distance = self.quantization.distance(&self.storage, vector, i);
|
||||
let payload = self.payloads[i as usize];
|
||||
result.push(Reverse(Element { distance, payload }));
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn vbase<'a>(
|
||||
&'a self,
|
||||
vector: Borrowed<'a, O>,
|
||||
_opts: &'a SearchOptions,
|
||||
_: &'a SearchOptions,
|
||||
) -> (Vec<Element>, Box<(dyn Iterator<Item = Element> + 'a)>) {
|
||||
vbase(&self.mmap, vector)
|
||||
let mut result = Vec::new();
|
||||
for i in 0..self.storage.len() {
|
||||
let distance = self.quantization.distance(&self.storage, vector, i);
|
||||
let payload = self.payloads[i as usize];
|
||||
result.push(Element { distance, payload });
|
||||
}
|
||||
(result, Box::new(std::iter::empty()))
|
||||
}
|
||||
|
||||
pub fn len(&self) -> u32 {
|
||||
self.mmap.storage.len()
|
||||
self.storage.len()
|
||||
}
|
||||
|
||||
pub fn vector(&self, i: u32) -> Borrowed<'_, O> {
|
||||
self.mmap.storage.vector(i)
|
||||
self.storage.vector(i)
|
||||
}
|
||||
|
||||
pub fn payload(&self, i: u32) -> Payload {
|
||||
self.mmap.storage.payload(i)
|
||||
self.payloads[i as usize]
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl<O: OperatorFlat> Send for Flat<O> {}
|
||||
unsafe impl<O: OperatorFlat> Sync for Flat<O> {}
|
||||
|
||||
pub struct FlatRam<O: OperatorFlat> {
|
||||
storage: Arc<StorageCollection<O>>,
|
||||
quantization: Quantization<O, StorageCollection<O>>,
|
||||
}
|
||||
|
||||
pub struct FlatMmap<O: OperatorFlat> {
|
||||
storage: Arc<StorageCollection<O>>,
|
||||
quantization: Quantization<O, StorageCollection<O>>,
|
||||
}
|
||||
|
||||
unsafe impl<O: OperatorFlat> Send for FlatMmap<O> {}
|
||||
unsafe impl<O: OperatorFlat> Sync for FlatMmap<O> {}
|
||||
|
||||
pub fn make<O: OperatorFlat, S: Source<O>>(
|
||||
path: &Path,
|
||||
fn from_nothing<O: OperatorFlat>(
|
||||
path: impl AsRef<Path>,
|
||||
options: IndexOptions,
|
||||
source: &S,
|
||||
) -> FlatRam<O> {
|
||||
let idx_opts = options.indexing.clone().unwrap_flat();
|
||||
let storage = Arc::new(StorageCollection::create(&path.join("raw"), source));
|
||||
collection: &impl Collection<O>,
|
||||
) -> Flat<O> {
|
||||
create_dir(path.as_ref()).unwrap();
|
||||
let flat_indexing_options = options.indexing.clone().unwrap_flat();
|
||||
let storage = O::Storage::create(path.as_ref().join("storage"), collection);
|
||||
let quantization = Quantization::create(
|
||||
&path.join("quantization"),
|
||||
path.as_ref().join("quantization"),
|
||||
options.clone(),
|
||||
idx_opts.quantization,
|
||||
&storage,
|
||||
(0..storage.len()).collect::<Vec<_>>(),
|
||||
flat_indexing_options.quantization,
|
||||
collection,
|
||||
);
|
||||
FlatRam {
|
||||
let payloads = MmapArray::create(
|
||||
path.as_ref().join("payloads"),
|
||||
(0..collection.len()).map(|i| collection.payload(i)),
|
||||
);
|
||||
sync_dir(path.as_ref());
|
||||
Flat {
|
||||
storage,
|
||||
quantization,
|
||||
payloads,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn save<O: OperatorFlat>(_: &Path, ram: FlatRam<O>) -> FlatMmap<O> {
|
||||
FlatMmap {
|
||||
storage: ram.storage,
|
||||
quantization: ram.quantization,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn open<O: OperatorFlat>(path: &Path, options: IndexOptions) -> FlatMmap<O> {
|
||||
let idx_opts = options.indexing.clone().unwrap_flat();
|
||||
let storage = Arc::new(StorageCollection::open(&path.join("raw"), options.clone()));
|
||||
rayon::check();
|
||||
let quantization = Quantization::open(
|
||||
&path.join("quantization"),
|
||||
options.clone(),
|
||||
idx_opts.quantization,
|
||||
&storage,
|
||||
);
|
||||
rayon::check();
|
||||
FlatMmap {
|
||||
fn open<O: OperatorFlat>(path: impl AsRef<Path>) -> Flat<O> {
|
||||
let storage = O::Storage::open(path.as_ref().join("storage"));
|
||||
let quantization = Quantization::open(path.as_ref().join("quantization"));
|
||||
let payloads = MmapArray::open(path.as_ref().join("payloads"));
|
||||
Flat {
|
||||
storage,
|
||||
quantization,
|
||||
payloads,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn basic<O: OperatorFlat>(
|
||||
mmap: &FlatMmap<O>,
|
||||
vector: Borrowed<'_, O>,
|
||||
) -> BinaryHeap<Reverse<Element>> {
|
||||
let mut result = BinaryHeap::new();
|
||||
for i in 0..mmap.storage.len() {
|
||||
let distance = mmap.quantization.distance(vector, i);
|
||||
let payload = mmap.storage.payload(i);
|
||||
result.push(Reverse(Element { distance, payload }));
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn vbase<'a, O: OperatorFlat>(
|
||||
mmap: &'a FlatMmap<O>,
|
||||
vector: Borrowed<'a, O>,
|
||||
) -> (Vec<Element>, Box<dyn Iterator<Item = Element> + 'a>) {
|
||||
let mut result = Vec::new();
|
||||
for i in 0..mmap.storage.len() {
|
||||
let distance = mmap.quantization.distance(vector, i);
|
||||
let payload = mmap.storage.payload(i);
|
||||
result.push(Element { distance, payload });
|
||||
}
|
||||
(result, Box::new(std::iter::empty()))
|
||||
}
|
||||
|
15
crates/graph/Cargo.toml
Normal file
15
crates/graph/Cargo.toml
Normal file
@ -0,0 +1,15 @@
|
||||
[package]
|
||||
name = "graph"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[dependencies]
|
||||
parking_lot.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
base = { path = "../base" }
|
||||
common = { path = "../common" }
|
||||
stoppable_rayon = { path = "../stoppable_rayon" }
|
||||
|
||||
[lints]
|
||||
workspace = true
|
5
crates/graph/src/lib.rs
Normal file
5
crates/graph/src/lib.rs
Normal file
@ -0,0 +1,5 @@
|
||||
#![allow(clippy::type_complexity)]
|
||||
|
||||
pub mod prune;
|
||||
pub mod search;
|
||||
pub mod visited;
|
63
crates/graph/src/prune.rs
Normal file
63
crates/graph/src/prune.rs
Normal file
@ -0,0 +1,63 @@
|
||||
use base::scalar::F32;
|
||||
|
||||
pub fn prune(
|
||||
dist: impl Fn(u32, u32) -> F32,
|
||||
u: u32,
|
||||
edges: &mut Vec<(F32, u32)>,
|
||||
add: &[(F32, u32)],
|
||||
m: u32,
|
||||
) {
|
||||
let mut trace = add.to_vec();
|
||||
trace.extend(edges.as_slice());
|
||||
trace.sort_by_key(|(_, v)| *v);
|
||||
trace.dedup_by_key(|(_, v)| *v);
|
||||
trace.retain(|(_, v)| *v != u);
|
||||
trace.sort();
|
||||
let mut res = Vec::new();
|
||||
for (dis_u, u) in trace {
|
||||
if res.len() == m as usize {
|
||||
break;
|
||||
}
|
||||
let check = res
|
||||
.iter()
|
||||
.map(|&(_, v)| dist(u, v))
|
||||
.all(|dist| dist > dis_u);
|
||||
if check {
|
||||
res.push((dis_u, u));
|
||||
}
|
||||
}
|
||||
*edges = res;
|
||||
}
|
||||
|
||||
pub fn robust_prune(
|
||||
dist: impl Fn(u32, u32) -> F32,
|
||||
u: u32,
|
||||
edges: &mut Vec<(F32, u32)>,
|
||||
add: &[(F32, u32)],
|
||||
alpha: f32,
|
||||
m: u32,
|
||||
) {
|
||||
let alpha = F32(alpha);
|
||||
// V ← (V ∪ Nout(p)) \ {p}
|
||||
let mut trace = add.to_vec();
|
||||
trace.extend(edges.as_slice());
|
||||
trace.sort_by_key(|(_, v)| *v);
|
||||
trace.dedup_by_key(|(_, v)| *v);
|
||||
trace.retain(|(_, v)| *v != u);
|
||||
trace.sort();
|
||||
// Nout(p) ← ∅
|
||||
let mut res = Vec::new();
|
||||
for (dis_u, u) in trace {
|
||||
if res.len() == m as usize {
|
||||
break;
|
||||
}
|
||||
let check = res
|
||||
.iter()
|
||||
.map(|&(_, v)| dist(u, v))
|
||||
.all(|dist| alpha * dist > dis_u);
|
||||
if check {
|
||||
res.push((dis_u, u));
|
||||
}
|
||||
}
|
||||
*edges = res;
|
||||
}
|
239
crates/graph/src/search.rs
Normal file
239
crates/graph/src/search.rs
Normal file
@ -0,0 +1,239 @@
|
||||
use crate::visited::VisitedGuard;
|
||||
use crate::visited::VisitedPool;
|
||||
use base::scalar::F32;
|
||||
use base::search::Element;
|
||||
use base::search::Payload;
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
|
||||
trait ResultsBound: Ord {
|
||||
type T: Ord + Copy;
|
||||
fn bound(&self) -> Self::T;
|
||||
}
|
||||
|
||||
impl<T: Ord + Copy, U: Ord> ResultsBound for (T, U) {
|
||||
type T = T;
|
||||
|
||||
fn bound(&self) -> T {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl ResultsBound for Element {
|
||||
type T = F32;
|
||||
|
||||
fn bound(&self) -> F32 {
|
||||
self.distance
|
||||
}
|
||||
}
|
||||
|
||||
struct Results<T> {
|
||||
size: usize,
|
||||
heap: BinaryHeap<T>,
|
||||
}
|
||||
|
||||
impl<T: ResultsBound> Results<T> {
|
||||
fn new(size: usize) -> Self {
|
||||
assert_ne!(size, 0, "size cannot be zero");
|
||||
Results {
|
||||
size,
|
||||
heap: BinaryHeap::with_capacity(size + 1),
|
||||
}
|
||||
}
|
||||
fn push(&mut self, element: T) {
|
||||
self.heap.push(element);
|
||||
if self.heap.len() > self.size {
|
||||
self.heap.pop();
|
||||
}
|
||||
}
|
||||
fn check(&self, value: T::T) -> bool {
|
||||
if self.heap.len() < self.size {
|
||||
true
|
||||
} else {
|
||||
Some(value) < self.heap.peek().map(T::bound)
|
||||
}
|
||||
}
|
||||
fn into_sorted_vec(self) -> Vec<T> {
|
||||
self.heap.into_sorted_vec()
|
||||
}
|
||||
fn into_reversed_heap(self) -> BinaryHeap<Reverse<T>> {
|
||||
self.heap.into_iter().map(Reverse).collect()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn search<E>(
|
||||
dist: impl Fn(u32) -> F32,
|
||||
read_outs: impl Fn(u32) -> E,
|
||||
visited: &mut VisitedGuard,
|
||||
s: u32,
|
||||
ef_construction: u32,
|
||||
) -> Vec<(F32, u32)>
|
||||
where
|
||||
E: Iterator<Item = u32>,
|
||||
{
|
||||
let mut visited = visited.fetch_checker();
|
||||
let mut candidates = BinaryHeap::<Reverse<(F32, u32)>>::new();
|
||||
let mut results = Results::new(ef_construction as _);
|
||||
{
|
||||
let dis_s = dist(s);
|
||||
visited.mark(s);
|
||||
candidates.push(Reverse((dis_s, s)));
|
||||
}
|
||||
while let Some(Reverse((dis_u, u))) = candidates.pop() {
|
||||
if !results.check(dis_u) {
|
||||
break;
|
||||
}
|
||||
results.push((dis_u, u));
|
||||
for v in read_outs(u) {
|
||||
if !visited.check(v) {
|
||||
continue;
|
||||
}
|
||||
visited.mark(v);
|
||||
let dis_v = dist(v);
|
||||
if results.check(dis_v) {
|
||||
candidates.push(Reverse((dis_v, v)));
|
||||
}
|
||||
}
|
||||
}
|
||||
results.into_sorted_vec()
|
||||
}
|
||||
|
||||
pub fn search_returning_trace<E>(
|
||||
dist: impl Fn(u32) -> F32,
|
||||
read_outs: impl Fn(u32) -> E,
|
||||
visited: &mut VisitedGuard,
|
||||
s: u32,
|
||||
ef_construction: u32,
|
||||
) -> (Vec<(F32, u32)>, Vec<(F32, u32)>)
|
||||
where
|
||||
E: Iterator<Item = u32>,
|
||||
{
|
||||
let mut visited = visited.fetch_checker();
|
||||
let mut candidates = BinaryHeap::<Reverse<(F32, u32)>>::new();
|
||||
let mut results = Results::new(ef_construction as _);
|
||||
let mut trace = Vec::new();
|
||||
{
|
||||
let dis_s = dist(s);
|
||||
visited.mark(s);
|
||||
candidates.push(Reverse((dis_s, s)));
|
||||
}
|
||||
while let Some(Reverse((dis_u, u))) = candidates.pop() {
|
||||
if !results.check(dis_u) {
|
||||
break;
|
||||
}
|
||||
trace.push((dis_u, u));
|
||||
results.push((dis_u, u));
|
||||
for v in read_outs(u) {
|
||||
if !visited.check(v) {
|
||||
continue;
|
||||
}
|
||||
visited.mark(v);
|
||||
let dis_v = dist(v);
|
||||
if results.check(dis_v) {
|
||||
candidates.push(Reverse((dis_v, v)));
|
||||
}
|
||||
}
|
||||
}
|
||||
(results.into_sorted_vec(), trace)
|
||||
}
|
||||
|
||||
pub fn basic<E>(
|
||||
dist: impl Fn(u32) -> F32,
|
||||
read_outs: impl Fn(u32) -> E,
|
||||
read_payload: impl Fn(u32) -> Payload,
|
||||
visited: &VisitedPool,
|
||||
s: u32,
|
||||
ef_search: u32,
|
||||
) -> BinaryHeap<Reverse<Element>>
|
||||
where
|
||||
E: Iterator<Item = u32>,
|
||||
{
|
||||
let mut visited = visited.fetch_guard_checker();
|
||||
let mut candidates = BinaryHeap::<Reverse<(F32, u32)>>::new();
|
||||
let mut results = Results::new(ef_search as _);
|
||||
{
|
||||
let dis_s = dist(s);
|
||||
visited.mark(s);
|
||||
candidates.push(Reverse((dis_s, s)));
|
||||
}
|
||||
while let Some(Reverse((dis_u, u))) = candidates.pop() {
|
||||
if !results.check(dis_u) {
|
||||
break;
|
||||
}
|
||||
results.push(Element {
|
||||
distance: dis_u,
|
||||
payload: read_payload(u),
|
||||
});
|
||||
for v in read_outs(u) {
|
||||
if !visited.check(v) {
|
||||
continue;
|
||||
}
|
||||
visited.mark(v);
|
||||
let dis_v = dist(v);
|
||||
if results.check(dis_v) {
|
||||
candidates.push(Reverse((dis_v, v)));
|
||||
}
|
||||
}
|
||||
}
|
||||
results.into_reversed_heap()
|
||||
}
|
||||
|
||||
pub fn vbase_internal<'a, E>(
|
||||
dist: impl Fn(u32) -> F32 + 'a,
|
||||
read_outs: impl Fn(u32) -> E + 'a,
|
||||
read_payload: impl Fn(u32) -> Payload + 'a,
|
||||
visited: &'a VisitedPool,
|
||||
s: u32,
|
||||
) -> impl Iterator<Item = Element> + 'a
|
||||
where
|
||||
E: Iterator<Item = u32>,
|
||||
{
|
||||
let mut visited = visited.fetch_guard_checker();
|
||||
let mut candidates = BinaryHeap::<Reverse<(F32, u32)>>::new();
|
||||
{
|
||||
let dis_s = dist(s);
|
||||
visited.mark(s);
|
||||
candidates.push(Reverse((dis_s, s)));
|
||||
}
|
||||
std::iter::from_fn(move || {
|
||||
let Reverse((dis_u, u)) = candidates.pop()?;
|
||||
for v in read_outs(u) {
|
||||
if !visited.check(v) {
|
||||
continue;
|
||||
}
|
||||
visited.mark(v);
|
||||
let dis_v = dist(v);
|
||||
candidates.push(Reverse((dis_v, v)));
|
||||
}
|
||||
Some(Element {
|
||||
distance: dis_u,
|
||||
payload: read_payload(u),
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
pub fn vbase_generic<'a, E>(
|
||||
dist: impl Fn(u32) -> F32 + 'a,
|
||||
read_outs: impl Fn(u32) -> E + 'a,
|
||||
read_payload: impl Fn(u32) -> Payload + 'a,
|
||||
visited: &'a VisitedPool,
|
||||
s: u32,
|
||||
ef_search: u32,
|
||||
) -> (Vec<Element>, Box<(dyn Iterator<Item = Element> + 'a)>)
|
||||
where
|
||||
E: Iterator<Item = u32>,
|
||||
{
|
||||
let mut iter = vbase_internal(dist, read_outs, read_payload, visited, s);
|
||||
let mut results = Results::<Element>::new(ef_search as _);
|
||||
let mut stage1 = Vec::new();
|
||||
for x in &mut iter {
|
||||
if results.check(x.distance) {
|
||||
results.push(x);
|
||||
stage1.push(x);
|
||||
} else {
|
||||
stage1.push(x);
|
||||
break;
|
||||
}
|
||||
}
|
||||
(stage1, Box::new(iter))
|
||||
}
|
@ -12,7 +12,7 @@ impl VisitedPool {
|
||||
locked_buffers: Mutex::new(Vec::new()),
|
||||
}
|
||||
}
|
||||
pub fn fetch(&self) -> VisitedGuard {
|
||||
pub fn fetch_guard(&self) -> VisitedGuard {
|
||||
let buffer = self
|
||||
.locked_buffers
|
||||
.lock()
|
||||
@ -21,7 +21,7 @@ impl VisitedPool {
|
||||
VisitedGuard { buffer, pool: self }
|
||||
}
|
||||
|
||||
pub fn fetch2(&self) -> VisitedGuardChecker {
|
||||
pub fn fetch_guard_checker(&self) -> VisitedGuardChecker {
|
||||
let mut buffer = self
|
||||
.locked_buffers
|
||||
.lock()
|
||||
@ -43,7 +43,7 @@ pub struct VisitedGuard<'a> {
|
||||
}
|
||||
|
||||
impl<'a> VisitedGuard<'a> {
|
||||
pub fn fetch(&mut self) -> VisitedChecker<'_> {
|
||||
pub fn fetch_checker(&mut self) -> VisitedChecker<'_> {
|
||||
self.buffer.version = self.buffer.version.wrapping_add(1);
|
||||
if self.buffer.version == 0 {
|
||||
self.buffer.data.fill(0);
|
||||
@ -112,7 +112,7 @@ impl VisitedBuffer {
|
||||
pub fn new(capacity: usize) -> Self {
|
||||
Self {
|
||||
version: 0,
|
||||
data: bytemuck::zeroed_vec(capacity),
|
||||
data: base::pod::zeroed_vec(capacity),
|
||||
}
|
||||
}
|
||||
}
|
@ -4,13 +4,15 @@ version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[dependencies]
|
||||
bytemuck.workspace = true
|
||||
num-traits.workspace = true
|
||||
parking_lot.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
base = { path = "../base" }
|
||||
common = { path = "../common" }
|
||||
graph = { path = "../graph" }
|
||||
quantization = { path = "../quantization" }
|
||||
rayon = { path = "../rayon" }
|
||||
stoppable_rayon = { path = "../stoppable_rayon" }
|
||||
storage = { path = "../storage" }
|
||||
|
||||
[lints]
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -23,7 +23,7 @@ base = { path = "../base" }
|
||||
common = { path = "../common" }
|
||||
elkan_k_means = { path = "../elkan_k_means" }
|
||||
quantization = { path = "../quantization" }
|
||||
rayon = { path = "../rayon" }
|
||||
stoppable_rayon = { path = "../stoppable_rayon" }
|
||||
storage = { path = "../storage" }
|
||||
|
||||
# algorithms
|
||||
|
@ -1,8 +1,5 @@
|
||||
use crate::utils::file_wal::FileWal;
|
||||
pub use base::distance::*;
|
||||
pub use base::index::*;
|
||||
pub use base::search::*;
|
||||
pub use base::vector::*;
|
||||
use base::search::*;
|
||||
use dashmap::mapref::entry::Entry;
|
||||
use dashmap::DashMap;
|
||||
use parking_lot::Mutex;
|
||||
|
58
crates/index/src/indexing/growing.rs
Normal file
58
crates/index/src/indexing/growing.rs
Normal file
@ -0,0 +1,58 @@
|
||||
use crate::Op;
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
use base::search::*;
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::convert::Infallible;
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
#[error("`GrowingIndexing` is read-only.")]
|
||||
pub struct GrowingIndexingInsertError;
|
||||
|
||||
pub enum GrowingIndexing<O: Op> {
|
||||
Infallible(Infallible, fn(O) -> O),
|
||||
}
|
||||
|
||||
impl<O: Op> GrowingIndexing<O> {
|
||||
pub fn new(_: VectorOptions, _: usize) -> Self {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn is_full(&self) -> bool {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn seal(&self) {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn insert(&self, _: O::VectorOwned, _: Payload) -> Result<(), GrowingIndexingInsertError> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn basic(&self, _: Borrowed<'_, O>, _: &SearchOptions) -> BinaryHeap<Reverse<Element>> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn vbase<'a>(
|
||||
&'a self,
|
||||
_: Borrowed<'a, O>,
|
||||
_: &'a SearchOptions,
|
||||
) -> (Vec<Element>, Box<(dyn Iterator<Item = Element> + 'a)>) {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn len(&self) -> u32 {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn vector(&self, _i: u32) -> Borrowed<'_, O> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn payload(&self, _i: u32) -> Payload {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
2
crates/index/src/indexing/mod.rs
Normal file
2
crates/index/src/indexing/mod.rs
Normal file
@ -0,0 +1,2 @@
|
||||
pub mod growing;
|
||||
pub mod sealed;
|
@ -1,9 +1,7 @@
|
||||
use crate::Op;
|
||||
pub use base::distance::*;
|
||||
pub use base::index::*;
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
pub use base::search::*;
|
||||
pub use base::vector::*;
|
||||
use base::search::*;
|
||||
use flat::Flat;
|
||||
use hnsw::Hnsw;
|
||||
use ivf::Ivf;
|
||||
@ -11,14 +9,18 @@ use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::path::Path;
|
||||
|
||||
pub enum Indexing<O: Op> {
|
||||
pub enum SealedIndexing<O: Op> {
|
||||
Flat(Flat<O>),
|
||||
Ivf(Ivf<O>),
|
||||
Hnsw(Hnsw<O>),
|
||||
}
|
||||
|
||||
impl<O: Op> Indexing<O> {
|
||||
pub fn create<S: Source<O>>(path: &Path, options: IndexOptions, source: &S) -> Self {
|
||||
impl<O: Op> SealedIndexing<O> {
|
||||
pub fn create(
|
||||
path: impl AsRef<Path>,
|
||||
options: IndexOptions,
|
||||
source: &(impl Source<O> + Sync),
|
||||
) -> Self {
|
||||
match options.indexing {
|
||||
IndexingOptions::Flat(_) => Self::Flat(Flat::create(path, options, source)),
|
||||
IndexingOptions::Ivf(_) => Self::Ivf(Ivf::create(path, options, source)),
|
||||
@ -26,11 +28,11 @@ impl<O: Op> Indexing<O> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn open(path: &Path, options: IndexOptions) -> Self {
|
||||
pub fn open(path: impl AsRef<Path>, options: IndexOptions) -> Self {
|
||||
match options.indexing {
|
||||
IndexingOptions::Flat(_) => Self::Flat(Flat::open(path, options)),
|
||||
IndexingOptions::Ivf(_) => Self::Ivf(Ivf::open(path, options)),
|
||||
IndexingOptions::Hnsw(_) => Self::Hnsw(Hnsw::open(path, options)),
|
||||
IndexingOptions::Flat(_) => Self::Flat(Flat::open(path)),
|
||||
IndexingOptions::Ivf(_) => Self::Ivf(Ivf::open(path)),
|
||||
IndexingOptions::Hnsw(_) => Self::Hnsw(Hnsw::open(path)),
|
||||
}
|
||||
}
|
||||
|
||||
@ -40,9 +42,9 @@ impl<O: Op> Indexing<O> {
|
||||
opts: &SearchOptions,
|
||||
) -> BinaryHeap<Reverse<Element>> {
|
||||
match self {
|
||||
Indexing::Flat(x) => x.basic(vector, opts),
|
||||
Indexing::Ivf(x) => x.basic(vector, opts),
|
||||
Indexing::Hnsw(x) => x.basic(vector, opts),
|
||||
SealedIndexing::Flat(x) => x.basic(vector, opts),
|
||||
SealedIndexing::Ivf(x) => x.basic(vector, opts),
|
||||
SealedIndexing::Hnsw(x) => x.basic(vector, opts),
|
||||
}
|
||||
}
|
||||
|
||||
@ -52,33 +54,33 @@ impl<O: Op> Indexing<O> {
|
||||
opts: &'a SearchOptions,
|
||||
) -> (Vec<Element>, Box<(dyn Iterator<Item = Element> + 'a)>) {
|
||||
match self {
|
||||
Indexing::Flat(x) => x.vbase(vector, opts),
|
||||
Indexing::Ivf(x) => x.vbase(vector, opts),
|
||||
Indexing::Hnsw(x) => x.vbase(vector, opts),
|
||||
SealedIndexing::Flat(x) => x.vbase(vector, opts),
|
||||
SealedIndexing::Ivf(x) => x.vbase(vector, opts),
|
||||
SealedIndexing::Hnsw(x) => x.vbase(vector, opts),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn len(&self) -> u32 {
|
||||
match self {
|
||||
Indexing::Flat(x) => x.len(),
|
||||
Indexing::Ivf(x) => x.len(),
|
||||
Indexing::Hnsw(x) => x.len(),
|
||||
SealedIndexing::Flat(x) => x.len(),
|
||||
SealedIndexing::Ivf(x) => x.len(),
|
||||
SealedIndexing::Hnsw(x) => x.len(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn vector(&self, i: u32) -> Borrowed<'_, O> {
|
||||
match self {
|
||||
Indexing::Flat(x) => x.vector(i),
|
||||
Indexing::Ivf(x) => x.vector(i),
|
||||
Indexing::Hnsw(x) => x.vector(i),
|
||||
SealedIndexing::Flat(x) => x.vector(i),
|
||||
SealedIndexing::Ivf(x) => x.vector(i),
|
||||
SealedIndexing::Hnsw(x) => x.vector(i),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn payload(&self, i: u32) -> Payload {
|
||||
match self {
|
||||
Indexing::Flat(x) => x.payload(i),
|
||||
Indexing::Ivf(x) => x.payload(i),
|
||||
Indexing::Hnsw(x) => x.payload(i),
|
||||
SealedIndexing::Flat(x) => x.payload(i),
|
||||
SealedIndexing::Ivf(x) => x.payload(i),
|
||||
SealedIndexing::Hnsw(x) => x.payload(i),
|
||||
}
|
||||
}
|
||||
}
|
@ -1,24 +1,22 @@
|
||||
#![feature(trait_alias)]
|
||||
#![allow(clippy::len_without_is_empty)]
|
||||
|
||||
pub mod delete;
|
||||
pub mod indexing;
|
||||
pub mod optimizing;
|
||||
pub mod segments;
|
||||
pub mod segment;
|
||||
|
||||
mod utils;
|
||||
|
||||
use self::delete::Delete;
|
||||
use self::segments::growing::GrowingSegment;
|
||||
use self::segments::sealed::SealedSegment;
|
||||
use self::segment::growing::GrowingSegment;
|
||||
use self::segment::sealed::SealedSegment;
|
||||
use crate::optimizing::Optimizing;
|
||||
use crate::utils::tournament_tree::LoserTree;
|
||||
use arc_swap::ArcSwap;
|
||||
pub use base::distance::*;
|
||||
pub use base::index::*;
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
pub use base::search::*;
|
||||
pub use base::vector::*;
|
||||
use base::search::*;
|
||||
use base::vector::*;
|
||||
use common::clean::clean;
|
||||
use common::dir_ops::sync_dir;
|
||||
use common::file_atomic::FileAtomic;
|
||||
@ -36,12 +34,14 @@ use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::thread::JoinHandle;
|
||||
use std::time::Instant;
|
||||
use storage::operator::OperatorStorage;
|
||||
use storage::OperatorStorage;
|
||||
use thiserror::Error;
|
||||
use uuid::Uuid;
|
||||
use validator::Validate;
|
||||
|
||||
pub trait Op = Operator + OperatorElkanKMeans + OperatorQuantization + OperatorStorage;
|
||||
pub trait Op: Operator + OperatorElkanKMeans + OperatorQuantization + OperatorStorage {}
|
||||
|
||||
impl<T: Operator + OperatorElkanKMeans + OperatorQuantization + OperatorStorage> Op for T {}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
#[error("The index view is outdated.")]
|
||||
@ -55,6 +55,7 @@ pub struct Index<O: Op> {
|
||||
view: ArcSwap<IndexView<O>>,
|
||||
instant_indexed: AtomicCell<Instant>,
|
||||
instant_written: AtomicCell<Instant>,
|
||||
check_deleted: AtomicCell<bool>,
|
||||
optimizing: Mutex<Option<(Sender<Infallible>, JoinHandle<()>)>>,
|
||||
_tracker: Arc<IndexTracker>,
|
||||
}
|
||||
@ -85,8 +86,8 @@ impl<O: Op> Index<O> {
|
||||
let startup = FileAtomic::create(
|
||||
path.join("startup"),
|
||||
IndexStartup {
|
||||
sealeds: HashSet::new(),
|
||||
growings: HashSet::new(),
|
||||
sealed_segment_ids: HashSet::new(),
|
||||
growing_segment_ids: HashSet::new(),
|
||||
alterable_options: alterable_options.clone(),
|
||||
},
|
||||
);
|
||||
@ -98,21 +99,22 @@ impl<O: Op> Index<O> {
|
||||
delete: delete.clone(),
|
||||
protect: Mutex::new(IndexProtect {
|
||||
startup,
|
||||
sealed: HashMap::new(),
|
||||
growing: HashMap::new(),
|
||||
write: None,
|
||||
sealed_segments: HashMap::new(),
|
||||
read_segments: HashMap::new(),
|
||||
write_segment: None,
|
||||
alterable_options: alterable_options.clone(),
|
||||
}),
|
||||
view: ArcSwap::new(Arc::new(IndexView {
|
||||
options: options.clone(),
|
||||
alterable_options: alterable_options.clone(),
|
||||
sealed: HashMap::new(),
|
||||
growing: HashMap::new(),
|
||||
sealed_segments: HashMap::new(),
|
||||
read_segments: HashMap::new(),
|
||||
delete: delete.clone(),
|
||||
write: None,
|
||||
write_segment: None,
|
||||
})),
|
||||
instant_indexed: AtomicCell::new(Instant::now()),
|
||||
instant_written: AtomicCell::new(Instant::now()),
|
||||
check_deleted: AtomicCell::new(false),
|
||||
optimizing: Mutex::new(None),
|
||||
_tracker: Arc::new(IndexTracker { path }),
|
||||
});
|
||||
@ -130,38 +132,44 @@ impl<O: Op> Index<O> {
|
||||
path.join("segments"),
|
||||
startup
|
||||
.get()
|
||||
.sealeds
|
||||
.sealed_segment_ids
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.chain(startup.get().growings.iter().map(|s| s.to_string())),
|
||||
.chain(
|
||||
startup
|
||||
.get()
|
||||
.growing_segment_ids
|
||||
.iter()
|
||||
.map(|s| s.to_string()),
|
||||
),
|
||||
);
|
||||
let sealed = startup
|
||||
let sealed_segments = startup
|
||||
.get()
|
||||
.sealeds
|
||||
.sealed_segment_ids
|
||||
.iter()
|
||||
.map(|&uuid| {
|
||||
.map(|&id| {
|
||||
(
|
||||
uuid,
|
||||
id,
|
||||
SealedSegment::<O>::open(
|
||||
tracker.clone(),
|
||||
path.join("segments").join(uuid.to_string()),
|
||||
uuid,
|
||||
path.join("segments").join(id.to_string()),
|
||||
id,
|
||||
options.clone(),
|
||||
),
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
let growing = startup
|
||||
let read_segments = startup
|
||||
.get()
|
||||
.growings
|
||||
.growing_segment_ids
|
||||
.iter()
|
||||
.map(|&uuid| {
|
||||
.map(|&id| {
|
||||
(
|
||||
uuid,
|
||||
id,
|
||||
GrowingSegment::open(
|
||||
tracker.clone(),
|
||||
path.join("segments").join(uuid.to_string()),
|
||||
uuid,
|
||||
path.join("segments").join(id.to_string()),
|
||||
id,
|
||||
options.clone(),
|
||||
),
|
||||
)
|
||||
@ -174,21 +182,22 @@ impl<O: Op> Index<O> {
|
||||
delete: delete.clone(),
|
||||
protect: Mutex::new(IndexProtect {
|
||||
startup,
|
||||
sealed: sealed.clone(),
|
||||
growing: growing.clone(),
|
||||
write: None,
|
||||
sealed_segments: sealed_segments.clone(),
|
||||
read_segments: read_segments.clone(),
|
||||
write_segment: None,
|
||||
alterable_options: alterable_options.clone(),
|
||||
}),
|
||||
view: ArcSwap::new(Arc::new(IndexView {
|
||||
options: options.clone(),
|
||||
alterable_options: alterable_options.clone(),
|
||||
delete: delete.clone(),
|
||||
sealed,
|
||||
growing,
|
||||
write: None,
|
||||
sealed_segments,
|
||||
read_segments,
|
||||
write_segment: None,
|
||||
})),
|
||||
instant_indexed: AtomicCell::new(Instant::now()),
|
||||
instant_written: AtomicCell::new(Instant::now()),
|
||||
check_deleted: AtomicCell::new(false),
|
||||
optimizing: Mutex::new(None),
|
||||
_tracker: tracker,
|
||||
})
|
||||
@ -215,36 +224,36 @@ impl<O: Op> Index<O> {
|
||||
}
|
||||
pub fn refresh(&self) {
|
||||
let mut protect = self.protect.lock();
|
||||
if let Some((uuid, write)) = protect.write.clone() {
|
||||
if let Some((id, write)) = protect.write_segment.clone() {
|
||||
if !write.is_full() {
|
||||
return;
|
||||
}
|
||||
write.seal();
|
||||
protect.growing.insert(uuid, write);
|
||||
protect.read_segments.insert(id, write);
|
||||
}
|
||||
let write_segment_uuid = Uuid::new_v4();
|
||||
let write_segment_id = Uuid::new_v4();
|
||||
let write_segment = GrowingSegment::create(
|
||||
self._tracker.clone(),
|
||||
self.path
|
||||
.join("segments")
|
||||
.join(write_segment_uuid.to_string()),
|
||||
write_segment_uuid,
|
||||
self.options.clone(),
|
||||
.join(write_segment_id.to_string()),
|
||||
write_segment_id,
|
||||
protect.alterable_options.segment.max_growing_segment_size as usize,
|
||||
);
|
||||
protect.write = Some((write_segment_uuid, write_segment));
|
||||
protect.write_segment = Some((write_segment_id, write_segment));
|
||||
protect.maintain(self.options.clone(), self.delete.clone(), &self.view);
|
||||
self.instant_written.store(Instant::now());
|
||||
}
|
||||
pub fn seal(&self, check: Uuid) {
|
||||
let mut protect = self.protect.lock();
|
||||
if let Some((uuid, write)) = protect.write.clone() {
|
||||
if check != uuid {
|
||||
if let Some((id, write_segment)) = protect.write_segment.clone() {
|
||||
if check != id {
|
||||
return;
|
||||
}
|
||||
write.seal();
|
||||
protect.growing.insert(uuid, write);
|
||||
write_segment.seal();
|
||||
protect.read_segments.insert(id, write_segment);
|
||||
}
|
||||
protect.write = None;
|
||||
protect.write_segment = None;
|
||||
protect.maintain(self.options.clone(), self.delete.clone(), &self.view);
|
||||
self.instant_written.store(Instant::now());
|
||||
}
|
||||
@ -255,19 +264,24 @@ impl<O: Op> Index<O> {
|
||||
options: self.options().clone(),
|
||||
segments: {
|
||||
let mut segments = Vec::new();
|
||||
for sealed in view.sealed.values() {
|
||||
segments.push(sealed.stat_sealed());
|
||||
for sealed_segment in view.sealed_segments.values() {
|
||||
segments.push(sealed_segment.stat_sealed());
|
||||
}
|
||||
for growing in view.growing.values() {
|
||||
segments.push(growing.stat_growing());
|
||||
for read_segment in view.read_segments.values() {
|
||||
segments.push(read_segment.stat_read());
|
||||
}
|
||||
if let Some(write) = view.write.as_ref().map(|(_, x)| x) {
|
||||
segments.push(write.stat_write());
|
||||
if let Some(write_segment) = view.write_segment.as_ref().map(|(_, x)| x) {
|
||||
segments.push(write_segment.stat_write());
|
||||
}
|
||||
segments
|
||||
},
|
||||
}
|
||||
}
|
||||
pub fn delete(&self, p: Pointer) -> Result<(), DeleteError> {
|
||||
self.delete.delete(p);
|
||||
self.check_deleted.store(false);
|
||||
Ok(())
|
||||
}
|
||||
pub fn start(self: &Arc<Self>) {
|
||||
let mut optimizing = self.optimizing.lock();
|
||||
if optimizing.is_none() {
|
||||
@ -281,6 +295,15 @@ impl<O: Op> Index<O> {
|
||||
let _ = join_handle.join();
|
||||
}
|
||||
}
|
||||
pub fn get_check_deleted_flag(&self) -> bool {
|
||||
self.check_deleted.load()
|
||||
}
|
||||
pub fn set_check_deleted_flag(&self) {
|
||||
self.check_deleted.store(true)
|
||||
}
|
||||
pub fn check_existing(&self, payload: Payload) -> bool {
|
||||
self.delete.check(payload).is_some()
|
||||
}
|
||||
pub fn wait(&self) -> Arc<IndexTracker> {
|
||||
Arc::clone(&self._tracker)
|
||||
}
|
||||
@ -301,9 +324,9 @@ pub struct IndexView<O: Op> {
|
||||
pub options: IndexOptions,
|
||||
pub alterable_options: IndexAlterableOptions,
|
||||
pub delete: Arc<Delete>,
|
||||
pub sealed: HashMap<Uuid, Arc<SealedSegment<O>>>,
|
||||
pub growing: HashMap<Uuid, Arc<GrowingSegment<O>>>,
|
||||
pub write: Option<(Uuid, Arc<GrowingSegment<O>>)>,
|
||||
pub sealed_segments: HashMap<Uuid, Arc<SealedSegment<O>>>,
|
||||
pub read_segments: HashMap<Uuid, Arc<GrowingSegment<O>>>,
|
||||
pub write_segment: Option<(Uuid, Arc<GrowingSegment<O>>)>,
|
||||
}
|
||||
|
||||
impl<O: Op> IndexView<O> {
|
||||
@ -331,17 +354,17 @@ impl<O: Op> IndexView<O> {
|
||||
}
|
||||
}
|
||||
|
||||
let n = self.sealed.len() + self.growing.len() + 1;
|
||||
let n = self.sealed_segments.len() + self.read_segments.len() + 1;
|
||||
let mut heaps = Vec::with_capacity(1 + n);
|
||||
for (_, sealed) in self.sealed.iter() {
|
||||
for (_, sealed) in self.sealed_segments.iter() {
|
||||
let p = sealed.basic(vector, opts);
|
||||
heaps.push(Comparer(p));
|
||||
}
|
||||
for (_, growing) in self.growing.iter() {
|
||||
let p = growing.basic(vector, opts);
|
||||
for (_, read) in self.read_segments.iter() {
|
||||
let p = read.basic(vector, opts);
|
||||
heaps.push(Comparer(p));
|
||||
}
|
||||
if let Some((_, write)) = &self.write {
|
||||
if let Some((_, write)) = &self.write_segment {
|
||||
let p = write.basic(vector, opts);
|
||||
heaps.push(Comparer(p));
|
||||
}
|
||||
@ -368,20 +391,20 @@ impl<O: Op> IndexView<O> {
|
||||
});
|
||||
}
|
||||
|
||||
let n = self.sealed.len() + self.growing.len() + 1;
|
||||
let n = self.sealed_segments.len() + self.read_segments.len() + 1;
|
||||
let mut alpha = Vec::new();
|
||||
let mut beta = Vec::with_capacity(1 + n);
|
||||
for (_, sealed) in self.sealed.iter() {
|
||||
for (_, sealed) in self.sealed_segments.iter() {
|
||||
let (stage1, stage2) = sealed.vbase(vector, opts);
|
||||
alpha.extend(stage1);
|
||||
beta.push(stage2);
|
||||
}
|
||||
for (_, growing) in self.growing.iter() {
|
||||
let (stage1, stage2) = growing.vbase(vector, opts);
|
||||
for (_, read) in self.read_segments.iter() {
|
||||
let (stage1, stage2) = read.vbase(vector, opts);
|
||||
alpha.extend(stage1);
|
||||
beta.push(stage2);
|
||||
}
|
||||
if let Some((_, write)) = &self.write {
|
||||
if let Some((_, write)) = &self.write_segment {
|
||||
let (stage1, stage2) = write.vbase(vector, opts);
|
||||
alpha.extend(stage1);
|
||||
beta.push(stage2);
|
||||
@ -398,22 +421,22 @@ impl<O: Op> IndexView<O> {
|
||||
}))
|
||||
}
|
||||
pub fn list(&self) -> Result<impl Iterator<Item = Pointer> + '_, ListError> {
|
||||
let sealed = self
|
||||
.sealed
|
||||
let sealed_segments = self
|
||||
.sealed_segments
|
||||
.values()
|
||||
.flat_map(|x| (0..x.len()).map(|i| x.payload(i)));
|
||||
let growing = self
|
||||
.growing
|
||||
let read_segments = self
|
||||
.read_segments
|
||||
.values()
|
||||
.flat_map(|x| (0..x.len()).map(|i| x.payload(i)));
|
||||
let write = self
|
||||
.write
|
||||
let write_segments = self
|
||||
.write_segment
|
||||
.iter()
|
||||
.map(|(_, x)| x)
|
||||
.flat_map(|x| (0..x.len()).map(|i| x.payload(i)));
|
||||
let iter = sealed
|
||||
.chain(growing)
|
||||
.chain(write)
|
||||
let iter = sealed_segments
|
||||
.chain(read_segments)
|
||||
.chain(write_segments)
|
||||
.filter_map(|p| self.delete.check(p));
|
||||
Ok(iter)
|
||||
}
|
||||
@ -427,9 +450,9 @@ impl<O: Op> IndexView<O> {
|
||||
}
|
||||
|
||||
let payload = Payload::new(pointer, self.delete.version(pointer));
|
||||
if let Some((_, growing)) = self.write.as_ref() {
|
||||
use crate::segments::growing::GrowingSegmentInsertError;
|
||||
if let Err(GrowingSegmentInsertError) = growing.insert(vector, payload) {
|
||||
if let Some((_, segment)) = self.write_segment.as_ref() {
|
||||
use crate::segment::growing::GrowingSegmentInsertError;
|
||||
if let Err(GrowingSegmentInsertError) = segment.insert(vector, payload) {
|
||||
return Ok(Err(OutdatedError));
|
||||
}
|
||||
Ok(Ok(()))
|
||||
@ -437,13 +460,9 @@ impl<O: Op> IndexView<O> {
|
||||
Ok(Err(OutdatedError))
|
||||
}
|
||||
}
|
||||
pub fn delete(&self, p: Pointer) -> Result<(), DeleteError> {
|
||||
self.delete.delete(p);
|
||||
Ok(())
|
||||
}
|
||||
pub fn flush(&self) -> Result<(), FlushError> {
|
||||
self.delete.flush();
|
||||
if let Some((_, write)) = &self.write {
|
||||
if let Some((_, write)) = &self.write_segment {
|
||||
write.flush();
|
||||
}
|
||||
Ok(())
|
||||
@ -452,16 +471,16 @@ impl<O: Op> IndexView<O> {
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct IndexStartup {
|
||||
sealeds: HashSet<Uuid>,
|
||||
growings: HashSet<Uuid>,
|
||||
sealed_segment_ids: HashSet<Uuid>,
|
||||
growing_segment_ids: HashSet<Uuid>,
|
||||
alterable_options: IndexAlterableOptions,
|
||||
}
|
||||
|
||||
struct IndexProtect<O: Op> {
|
||||
startup: FileAtomic<IndexStartup>,
|
||||
sealed: HashMap<Uuid, Arc<SealedSegment<O>>>,
|
||||
growing: HashMap<Uuid, Arc<GrowingSegment<O>>>,
|
||||
write: Option<(Uuid, Arc<GrowingSegment<O>>)>,
|
||||
sealed_segments: HashMap<Uuid, Arc<SealedSegment<O>>>,
|
||||
read_segments: HashMap<Uuid, Arc<GrowingSegment<O>>>,
|
||||
write_segment: Option<(Uuid, Arc<GrowingSegment<O>>)>,
|
||||
alterable_options: IndexAlterableOptions,
|
||||
}
|
||||
|
||||
@ -476,16 +495,17 @@ impl<O: Op> IndexProtect<O> {
|
||||
options,
|
||||
alterable_options: self.alterable_options.clone(),
|
||||
delete,
|
||||
sealed: self.sealed.clone(),
|
||||
growing: self.growing.clone(),
|
||||
write: self.write.clone(),
|
||||
sealed_segments: self.sealed_segments.clone(),
|
||||
read_segments: self.read_segments.clone(),
|
||||
write_segment: self.write_segment.clone(),
|
||||
});
|
||||
let startup_write = self.write.as_ref().map(|(uuid, _)| *uuid);
|
||||
let startup_sealeds = self.sealed.keys().copied().collect();
|
||||
let startup_growings = self.growing.keys().copied().chain(startup_write).collect();
|
||||
let read_segment_ids = self.read_segments.keys().copied();
|
||||
let write_segment_id = self.write_segment.as_ref().map(|(id, _)| *id);
|
||||
let growing_segment_ids = read_segment_ids.chain(write_segment_id).collect();
|
||||
let sealed_segment_ids = self.sealed_segments.keys().copied().collect();
|
||||
self.startup.set(IndexStartup {
|
||||
sealeds: startup_sealeds,
|
||||
growings: startup_growings,
|
||||
sealed_segment_ids,
|
||||
growing_segment_ids,
|
||||
alterable_options: self.alterable_options.clone(),
|
||||
});
|
||||
swap.swap(view);
|
||||
|
@ -1,14 +1,18 @@
|
||||
use crate::delete::Delete;
|
||||
use crate::Op;
|
||||
use crate::{GrowingSegment, SealedSegment};
|
||||
use base::index::IndexOptions;
|
||||
use base::operator::Borrowed;
|
||||
use base::search::*;
|
||||
use std::any::Any;
|
||||
use std::fmt::Debug;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct IndexSource<O: Op> {
|
||||
pub(super) sealed: Option<Arc<SealedSegment<O>>>,
|
||||
pub(super) growing: Vec<Arc<GrowingSegment<O>>>,
|
||||
pub(super) dims: u32,
|
||||
pub(super) delete: Arc<Delete>,
|
||||
}
|
||||
|
||||
impl<O: Op> IndexSource<O> {
|
||||
@ -16,16 +20,18 @@ impl<O: Op> IndexSource<O> {
|
||||
options: IndexOptions,
|
||||
sealed: Option<Arc<SealedSegment<O>>>,
|
||||
growing: Vec<Arc<GrowingSegment<O>>>,
|
||||
delete: Arc<Delete>,
|
||||
) -> Self {
|
||||
IndexSource {
|
||||
sealed,
|
||||
growing,
|
||||
dims: options.vector.dims,
|
||||
delete,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Op> Collection<O> for IndexSource<O> {
|
||||
impl<O: Op> Vectors<O> for IndexSource<O> {
|
||||
fn dims(&self) -> u32 {
|
||||
self.dims
|
||||
}
|
||||
@ -50,7 +56,9 @@ impl<O: Op> Collection<O> for IndexSource<O> {
|
||||
}
|
||||
panic!("Out of bound.")
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Op> Collection<O> for IndexSource<O> {
|
||||
fn payload(&self, mut index: u32) -> Payload {
|
||||
for x in self.sealed.iter() {
|
||||
if index < x.len() {
|
||||
@ -68,4 +76,67 @@ impl<O: Op> Collection<O> for IndexSource<O> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Op> Source<O> for IndexSource<O> {}
|
||||
impl<O: Op> Source<O> for IndexSource<O> {
|
||||
fn get_main<T: Any>(&self) -> Option<&T> {
|
||||
let x = self.sealed.as_ref()?;
|
||||
Some(
|
||||
x.indexing()
|
||||
.downcast_ref::<T>()
|
||||
.expect("called with incorrect index type"),
|
||||
)
|
||||
}
|
||||
|
||||
fn get_main_len(&self) -> u32 {
|
||||
self.sealed.as_ref().map(|x| x.len()).unwrap_or_default()
|
||||
}
|
||||
|
||||
fn check_existing(&self, i: u32) -> bool {
|
||||
self.delete.check(self.payload(i)).is_some()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RoGrowingCollection<O: Op> {
|
||||
pub(super) growing: Vec<Arc<GrowingSegment<O>>>,
|
||||
pub(super) dims: u32,
|
||||
}
|
||||
|
||||
impl<O: Op> Debug for RoGrowingCollection<O> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("RoGrowingCollection")
|
||||
.field("growing", &self.growing)
|
||||
.field("dims", &self.dims)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Op> Vectors<O> for RoGrowingCollection<O> {
|
||||
fn dims(&self) -> u32 {
|
||||
self.dims
|
||||
}
|
||||
|
||||
fn len(&self) -> u32 {
|
||||
self.growing.iter().map(|x| x.len()).sum::<u32>()
|
||||
}
|
||||
|
||||
fn vector(&self, mut index: u32) -> Borrowed<'_, O> {
|
||||
for x in self.growing.iter() {
|
||||
if index < x.len() {
|
||||
return x.vector(index);
|
||||
}
|
||||
index -= x.len();
|
||||
}
|
||||
panic!("Out of bound.")
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Op> Collection<O> for RoGrowingCollection<O> {
|
||||
fn payload(&self, mut index: u32) -> Payload {
|
||||
for x in self.growing.iter() {
|
||||
if index < x.len() {
|
||||
return x.payload(index);
|
||||
}
|
||||
index -= x.len();
|
||||
}
|
||||
panic!("Out of bound.")
|
||||
}
|
||||
}
|
||||
|
@ -2,34 +2,32 @@ use crate::optimizing::index_source::IndexSource;
|
||||
use crate::Index;
|
||||
use crate::Op;
|
||||
use crate::SealedSegment;
|
||||
pub use base::distance::*;
|
||||
pub use base::index::*;
|
||||
pub use base::search::*;
|
||||
pub use base::vector::*;
|
||||
use std::sync::Arc;
|
||||
use uuid::Uuid;
|
||||
|
||||
pub fn scan<O: Op>(index: Arc<Index<O>>) -> Option<IndexSource<O>> {
|
||||
pub fn scan<O: Op>(
|
||||
index: Arc<Index<O>>,
|
||||
capacity: u32,
|
||||
delete_threshold: f64,
|
||||
) -> Option<IndexSource<O>> {
|
||||
let (sealed, growing) = 'a: {
|
||||
let protect = index.protect.lock();
|
||||
// task 1 approach 1: merge small segments to a big segment
|
||||
// approach 1: merge small segments to a big segment
|
||||
{
|
||||
let mut counter = 0u64;
|
||||
let base_segment = {
|
||||
let mut sealed_segments = protect.sealed.values().collect::<Vec<_>>();
|
||||
let mut sealed_segments = protect.sealed_segments.values().collect::<Vec<_>>();
|
||||
sealed_segments.sort_by_key(|s| s.len());
|
||||
let base_segment = sealed_segments.first().cloned();
|
||||
counter += base_segment.map(|x| x.len() as u64).unwrap_or_default();
|
||||
base_segment.cloned()
|
||||
};
|
||||
let delta_segments = {
|
||||
let mut growing_segments = protect.growing.values().collect::<Vec<_>>();
|
||||
let mut growing_segments = protect.read_segments.values().collect::<Vec<_>>();
|
||||
growing_segments.sort_by_key(|s| s.len());
|
||||
let mut delta_segments = Vec::new();
|
||||
for growing_segment in growing_segments.iter().cloned().cloned() {
|
||||
if counter + growing_segment.len() as u64
|
||||
<= index.options.segment.max_sealed_segment_size as u64
|
||||
{
|
||||
if counter + growing_segment.len() as u64 <= capacity as u64 {
|
||||
counter += growing_segment.len() as u64;
|
||||
delta_segments.push(growing_segment);
|
||||
} else {
|
||||
@ -42,17 +40,15 @@ pub fn scan<O: Op>(index: Arc<Index<O>>) -> Option<IndexSource<O>> {
|
||||
break 'a (base_segment, delta_segments);
|
||||
}
|
||||
}
|
||||
// task 1 approach 2: merge small segments
|
||||
// approach 2: merge small segments
|
||||
{
|
||||
let mut counter = 0u64;
|
||||
let delta_segments = {
|
||||
let mut growing_segments = protect.growing.values().collect::<Vec<_>>();
|
||||
let mut growing_segments = protect.read_segments.values().collect::<Vec<_>>();
|
||||
growing_segments.sort_by_key(|s| s.len());
|
||||
let mut delta_segments = Vec::new();
|
||||
for growing_segment in growing_segments.iter().cloned().cloned() {
|
||||
if counter + growing_segment.len() as u64
|
||||
<= index.options.segment.max_sealed_segment_size as u64
|
||||
{
|
||||
if counter + growing_segment.len() as u64 <= capacity as u64 {
|
||||
counter += growing_segment.len() as u64;
|
||||
delta_segments.push(growing_segment);
|
||||
} else {
|
||||
@ -65,45 +61,63 @@ pub fn scan<O: Op>(index: Arc<Index<O>>) -> Option<IndexSource<O>> {
|
||||
break 'a (None, delta_segments);
|
||||
}
|
||||
}
|
||||
// approach 3: vacuum sealed segment
|
||||
if !index.get_check_deleted_flag() {
|
||||
let sealed_segments = protect.sealed_segments.values().collect::<Vec<_>>();
|
||||
for sealed_segment in sealed_segments {
|
||||
let mut counter = 0u64;
|
||||
for i in 0..sealed_segment.len() {
|
||||
if !index.check_existing(sealed_segment.payload(i)) {
|
||||
counter += 1;
|
||||
}
|
||||
}
|
||||
let value = counter as f64 / sealed_segment.len() as f64;
|
||||
if value >= delete_threshold {
|
||||
break 'a (Some(sealed_segment.clone()), Vec::new());
|
||||
}
|
||||
}
|
||||
index.set_check_deleted_flag();
|
||||
}
|
||||
return None;
|
||||
};
|
||||
Some(IndexSource::new(
|
||||
index.options().clone(),
|
||||
sealed.clone(),
|
||||
growing.clone(),
|
||||
index.delete.clone(),
|
||||
))
|
||||
}
|
||||
|
||||
pub fn make<O: Op>(index: Arc<Index<O>>, source: IndexSource<O>) {
|
||||
let next = {
|
||||
let uuid = Uuid::new_v4();
|
||||
let id = Uuid::new_v4();
|
||||
SealedSegment::create(
|
||||
index._tracker.clone(),
|
||||
index.path.join("segments").join(uuid.to_string()),
|
||||
uuid,
|
||||
index.path.join("segments").join(id.to_string()),
|
||||
id,
|
||||
index.options.clone(),
|
||||
&source,
|
||||
)
|
||||
};
|
||||
let mut protect = index.protect.lock();
|
||||
for sealed in source.sealed.iter() {
|
||||
if protect.sealed.contains_key(&sealed.uuid()) {
|
||||
for sealed_segment in source.sealed.iter() {
|
||||
if protect.sealed_segments.contains_key(&sealed_segment.id()) {
|
||||
continue;
|
||||
}
|
||||
return;
|
||||
}
|
||||
for growing in source.growing.iter() {
|
||||
if protect.growing.contains_key(&growing.uuid()) {
|
||||
for growing_segment in source.growing.iter() {
|
||||
if protect.read_segments.contains_key(&growing_segment.id()) {
|
||||
continue;
|
||||
}
|
||||
return;
|
||||
}
|
||||
for sealed in source.sealed.iter() {
|
||||
protect.sealed.remove(&sealed.uuid());
|
||||
for sealed_segment in source.sealed.iter() {
|
||||
protect.sealed_segments.remove(&sealed_segment.id());
|
||||
}
|
||||
for growing in source.growing.iter() {
|
||||
protect.growing.remove(&growing.uuid());
|
||||
for growing_segment in source.growing.iter() {
|
||||
protect.read_segments.remove(&growing_segment.id());
|
||||
}
|
||||
protect.sealed.insert(next.uuid(), next);
|
||||
protect.sealed_segments.insert(next.id(), next);
|
||||
protect.maintain(index.options.clone(), index.delete.clone(), &index.view);
|
||||
}
|
||||
|
@ -39,13 +39,13 @@ impl<O: Op> Optimizing<O> {
|
||||
Box::new(move || {
|
||||
let view = index.view();
|
||||
let stamp = view
|
||||
.write
|
||||
.write_segment
|
||||
.as_ref()
|
||||
.map(|(uuid, segment)| (*uuid, segment.len()));
|
||||
.map(|(id, segment)| (*id, segment.len()));
|
||||
if first || stamp == check {
|
||||
if let Some((uuid, len)) = stamp {
|
||||
if let Some((id, len)) = stamp {
|
||||
if len >= view.alterable_options.optimizing.sealing_size {
|
||||
index.seal(uuid);
|
||||
index.seal(id);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -59,8 +59,12 @@ impl<O: Op> Optimizing<O> {
|
||||
Instant::now(),
|
||||
Box::new(|| {
|
||||
let view = index.view();
|
||||
if let Some(source) = scan(index.clone()) {
|
||||
rayon::ThreadPoolBuilder::new()
|
||||
if let Some(source) = scan(
|
||||
index.clone(),
|
||||
view.alterable_options.segment.max_sealed_segment_size,
|
||||
view.alterable_options.optimizing.delete_threshold,
|
||||
) {
|
||||
stoppable_rayon::ThreadPoolBuilder::new()
|
||||
.num_threads(view.alterable_options.optimizing.optimizing_threads as usize)
|
||||
.build_scoped(|pool| {
|
||||
let (stop_tx, stop_rx) = bounded::<Infallible>(0);
|
||||
@ -93,7 +97,8 @@ impl<O: Op> Optimizing<O> {
|
||||
Instant::now()
|
||||
} else {
|
||||
index.instant_indexed.store(Instant::now());
|
||||
Instant::now() + Duration::from_secs(60)
|
||||
Instant::now()
|
||||
+ Duration::from_secs(view.alterable_options.optimizing.optimizing_secs)
|
||||
}
|
||||
}),
|
||||
);
|
||||
|
@ -12,6 +12,7 @@ use serde::{Deserialize, Serialize};
|
||||
use std::cell::UnsafeCell;
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt::Debug;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
@ -20,11 +21,11 @@ use thiserror::Error;
|
||||
use uuid::Uuid;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
#[error("`GrowingSegment` stopped growing.")]
|
||||
#[error("`GrowingSegment` is read-only.")]
|
||||
pub struct GrowingSegmentInsertError;
|
||||
|
||||
pub struct GrowingSegment<O: Op> {
|
||||
uuid: Uuid,
|
||||
id: Uuid,
|
||||
vec: Vec<MaybeUninit<UnsafeCell<Log<O>>>>,
|
||||
wal: Mutex<FileWal>,
|
||||
len: AtomicUsize,
|
||||
@ -32,29 +33,36 @@ pub struct GrowingSegment<O: Op> {
|
||||
_tracker: Arc<SegmentTracker>,
|
||||
}
|
||||
|
||||
impl<O: Op> Debug for GrowingSegment<O> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("GrowingSegment")
|
||||
.field("id", &self.id)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Op> GrowingSegment<O> {
|
||||
pub fn create(
|
||||
_tracker: Arc<IndexTracker>,
|
||||
path: PathBuf,
|
||||
uuid: Uuid,
|
||||
options: IndexOptions,
|
||||
id: Uuid,
|
||||
capacity: usize,
|
||||
) -> Arc<Self> {
|
||||
std::fs::create_dir(&path).unwrap();
|
||||
let wal = FileWal::create(path.join("wal"));
|
||||
let capacity = options.segment.max_growing_segment_size;
|
||||
sync_dir(&path);
|
||||
Arc::new(Self {
|
||||
uuid,
|
||||
id,
|
||||
vec: unsafe {
|
||||
let mut vec = Vec::with_capacity(capacity as usize);
|
||||
vec.set_len(capacity as usize);
|
||||
let mut vec = Vec::with_capacity(capacity);
|
||||
vec.set_len(capacity);
|
||||
vec
|
||||
},
|
||||
wal: Mutex::new(wal),
|
||||
len: AtomicUsize::new(0),
|
||||
pro: Mutex::new(Protect {
|
||||
inflight: 0,
|
||||
capacity: capacity as usize,
|
||||
capacity,
|
||||
}),
|
||||
_tracker: Arc::new(SegmentTracker { path, _tracker }),
|
||||
})
|
||||
@ -63,7 +71,7 @@ impl<O: Op> GrowingSegment<O> {
|
||||
pub fn open(
|
||||
_tracker: Arc<IndexTracker>,
|
||||
path: PathBuf,
|
||||
uuid: Uuid,
|
||||
id: Uuid,
|
||||
_: IndexOptions,
|
||||
) -> Arc<Self> {
|
||||
let mut wal = FileWal::open(path.join("wal"));
|
||||
@ -75,7 +83,7 @@ impl<O: Op> GrowingSegment<O> {
|
||||
wal.truncate();
|
||||
let n = vec.len();
|
||||
Arc::new(Self {
|
||||
uuid,
|
||||
id,
|
||||
vec,
|
||||
wal: { Mutex::new(wal) },
|
||||
len: AtomicUsize::new(n),
|
||||
@ -87,8 +95,8 @@ impl<O: Op> GrowingSegment<O> {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn uuid(&self) -> Uuid {
|
||||
self.uuid
|
||||
pub fn id(&self) -> Uuid {
|
||||
self.id
|
||||
}
|
||||
|
||||
pub fn is_full(&self) -> bool {
|
||||
@ -155,21 +163,23 @@ impl<O: Op> GrowingSegment<O> {
|
||||
self.len.load(Ordering::Acquire) as u32
|
||||
}
|
||||
|
||||
pub fn stat_growing(&self) -> SegmentStat {
|
||||
pub fn stat_read(&self) -> SegmentStat {
|
||||
let len = self.len();
|
||||
SegmentStat {
|
||||
id: self.uuid,
|
||||
id: self.id,
|
||||
r#type: "growing".to_string(),
|
||||
length: self.len() as usize,
|
||||
size: (self.len() as u64) * (std::mem::size_of::<Log<O>>() as u64),
|
||||
length: len as usize,
|
||||
size: (len as u64) * (std::mem::size_of::<Log<O>>() as u64),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stat_write(&self) -> SegmentStat {
|
||||
let len = self.len();
|
||||
SegmentStat {
|
||||
id: self.uuid,
|
||||
id: self.id,
|
||||
r#type: "write".to_string(),
|
||||
length: self.len() as usize,
|
||||
size: (self.len() as u64) * (std::mem::size_of::<Log<O>>() as u64),
|
||||
length: len as usize,
|
||||
size: (len as u64) * (std::mem::size_of::<Log<O>>() as u64),
|
||||
}
|
||||
}
|
||||
|
@ -1,15 +1,16 @@
|
||||
use super::SegmentTracker;
|
||||
use crate::indexing::Indexing;
|
||||
use crate::indexing::sealed::SealedIndexing;
|
||||
use crate::utils::dir_ops::dir_size;
|
||||
use crate::IndexTracker;
|
||||
use crate::Op;
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
use base::search::*;
|
||||
use common::dir_ops::sync_dir;
|
||||
use crossbeam::atomic::AtomicCell;
|
||||
use std::any::Any;
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt::Debug;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
@ -17,25 +18,31 @@ use std::time::Instant;
|
||||
use uuid::Uuid;
|
||||
|
||||
pub struct SealedSegment<O: Op> {
|
||||
uuid: Uuid,
|
||||
indexing: Indexing<O>,
|
||||
id: Uuid,
|
||||
indexing: SealedIndexing<O>,
|
||||
deletes: AtomicCell<(Instant, u32)>,
|
||||
_tracker: Arc<SegmentTracker>,
|
||||
}
|
||||
|
||||
impl<O: Op> Debug for SealedSegment<O> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SealedSegment")
|
||||
.field("id", &self.id)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Op> SealedSegment<O> {
|
||||
pub fn create<S: Source<O>>(
|
||||
pub fn create(
|
||||
_tracker: Arc<IndexTracker>,
|
||||
path: PathBuf,
|
||||
uuid: Uuid,
|
||||
id: Uuid,
|
||||
options: IndexOptions,
|
||||
source: &S,
|
||||
source: &(impl Source<O> + Sync),
|
||||
) -> Arc<Self> {
|
||||
std::fs::create_dir(&path).unwrap();
|
||||
let indexing = Indexing::create(&path.join("indexing"), options, source);
|
||||
sync_dir(&path);
|
||||
let indexing = SealedIndexing::create(&path, options, source);
|
||||
Arc::new(Self {
|
||||
uuid,
|
||||
id,
|
||||
indexing,
|
||||
deletes: AtomicCell::new((Instant::now(), 0)),
|
||||
_tracker: Arc::new(SegmentTracker { path, _tracker }),
|
||||
@ -45,29 +52,28 @@ impl<O: Op> SealedSegment<O> {
|
||||
pub fn open(
|
||||
_tracker: Arc<IndexTracker>,
|
||||
path: PathBuf,
|
||||
uuid: Uuid,
|
||||
id: Uuid,
|
||||
options: IndexOptions,
|
||||
) -> Arc<Self> {
|
||||
let indexing = Indexing::open(&path.join("indexing"), options);
|
||||
let indexing = SealedIndexing::open(&path, options);
|
||||
Arc::new(Self {
|
||||
uuid,
|
||||
id,
|
||||
indexing,
|
||||
deletes: AtomicCell::new((Instant::now(), 0)),
|
||||
_tracker: Arc::new(SegmentTracker { path, _tracker }),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn uuid(&self) -> Uuid {
|
||||
self.uuid
|
||||
pub fn id(&self) -> Uuid {
|
||||
self.id
|
||||
}
|
||||
|
||||
pub fn stat_sealed(&self) -> SegmentStat {
|
||||
let path = self._tracker.path.join("indexing");
|
||||
SegmentStat {
|
||||
id: self.uuid,
|
||||
id: self.id,
|
||||
r#type: "sealed".to_string(),
|
||||
length: self.len() as usize,
|
||||
size: dir_size(&path).unwrap(),
|
||||
size: dir_size(&self._tracker.path).unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
@ -114,4 +120,12 @@ impl<O: Op> SealedSegment<O> {
|
||||
Err(c)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn indexing(&self) -> &dyn Any {
|
||||
match &self.indexing {
|
||||
SealedIndexing::Flat(x) => x,
|
||||
SealedIndexing::Ivf(x) => x,
|
||||
SealedIndexing::Hnsw(x) => x,
|
||||
}
|
||||
}
|
||||
}
|
@ -2,9 +2,9 @@ use std::fs::read_dir;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
|
||||
pub fn dir_size(dir: &Path) -> io::Result<u64> {
|
||||
pub fn dir_size(dir: impl AsRef<Path>) -> io::Result<u64> {
|
||||
let mut size = 0;
|
||||
if dir.is_dir() {
|
||||
if dir.as_ref().is_dir() {
|
||||
for entry in read_dir(dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
|
@ -12,7 +12,7 @@ base = { path = "../base" }
|
||||
common = { path = "../common" }
|
||||
elkan_k_means = { path = "../elkan_k_means" }
|
||||
quantization = { path = "../quantization" }
|
||||
rayon = { path = "../rayon" }
|
||||
stoppable_rayon = { path = "../stoppable_rayon" }
|
||||
storage = { path = "../storage" }
|
||||
|
||||
[lints]
|
||||
|
@ -1,53 +1,51 @@
|
||||
use super::OperatorIvf as Op;
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
use base::scalar::F32;
|
||||
use base::search::*;
|
||||
use base::vector::*;
|
||||
use common::dir_ops::sync_dir;
|
||||
use common::json::Json;
|
||||
use common::mmap_array::MmapArray;
|
||||
use common::remap::RemappedCollection;
|
||||
use common::vec2::Vec2;
|
||||
use elkan_k_means::ElkanKMeans;
|
||||
use num_traits::Float;
|
||||
use elkan_k_means::elkan_k_means;
|
||||
use elkan_k_means::elkan_k_means_caluate;
|
||||
use elkan_k_means::elkan_k_means_lookup;
|
||||
use quantization::Quantization;
|
||||
use rand::seq::index::sample;
|
||||
use rand::thread_rng;
|
||||
use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fs::create_dir;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use storage::StorageCollection;
|
||||
use stoppable_rayon as rayon;
|
||||
use storage::Storage;
|
||||
|
||||
pub struct IvfNaive<O: Op> {
|
||||
mmap: IvfMmap<O>,
|
||||
storage: O::Storage,
|
||||
quantization: Quantization<O>,
|
||||
payloads: MmapArray<Payload>,
|
||||
offsets: Json<Vec<u32>>,
|
||||
centroids: Json<Vec2<Scalar<O>>>,
|
||||
}
|
||||
|
||||
impl<O: Op> IvfNaive<O> {
|
||||
pub fn create<S: Source<O>>(path: &Path, options: IndexOptions, source: &S) -> Self {
|
||||
create_dir(path).unwrap();
|
||||
let ram = make(path, options, source);
|
||||
let mmap = save(ram, path);
|
||||
sync_dir(path);
|
||||
Self { mmap }
|
||||
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
|
||||
let remapped = RemappedCollection::from_source(source);
|
||||
from_nothing(path, options, &remapped)
|
||||
}
|
||||
|
||||
pub fn open(path: &Path, options: IndexOptions) -> Self {
|
||||
let mmap = open(path, options);
|
||||
Self { mmap }
|
||||
pub fn open(path: impl AsRef<Path>) -> Self {
|
||||
open(path)
|
||||
}
|
||||
|
||||
pub fn len(&self) -> u32 {
|
||||
self.mmap.storage.len()
|
||||
self.storage.len()
|
||||
}
|
||||
|
||||
pub fn vector(&self, i: u32) -> Borrowed<'_, O> {
|
||||
self.mmap.storage.vector(i)
|
||||
self.storage.vector(i)
|
||||
}
|
||||
|
||||
pub fn payload(&self, i: u32) -> Payload {
|
||||
self.mmap.storage.payload(i)
|
||||
self.payloads[i as usize]
|
||||
}
|
||||
|
||||
pub fn basic(
|
||||
@ -55,7 +53,20 @@ impl<O: Op> IvfNaive<O> {
|
||||
vector: Borrowed<'_, O>,
|
||||
opts: &SearchOptions,
|
||||
) -> BinaryHeap<Reverse<Element>> {
|
||||
basic(&self.mmap, vector, opts.ivf_nprobe)
|
||||
let mut lists = elkan_k_means_caluate::<O>(vector, &self.centroids);
|
||||
lists.select_nth_unstable(opts.ivf_nprobe as usize);
|
||||
lists.truncate(opts.ivf_nprobe as usize);
|
||||
let mut result = BinaryHeap::new();
|
||||
for i in lists.iter().map(|(_, i)| *i) {
|
||||
let start = self.offsets[i];
|
||||
let end = self.offsets[i + 1];
|
||||
for j in start..end {
|
||||
let payload = self.payloads[j as usize];
|
||||
let distance = self.quantization.distance(&self.storage, vector, j);
|
||||
result.push(Reverse(Element { distance, payload }));
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn vbase<'a>(
|
||||
@ -63,233 +74,85 @@ impl<O: Op> IvfNaive<O> {
|
||||
vector: Borrowed<'a, O>,
|
||||
opts: &'a SearchOptions,
|
||||
) -> (Vec<Element>, Box<(dyn Iterator<Item = Element> + 'a)>) {
|
||||
vbase(&self.mmap, vector, opts.ivf_nprobe)
|
||||
let mut lists = elkan_k_means_caluate::<O>(vector, &self.centroids);
|
||||
lists.select_nth_unstable(opts.ivf_nprobe as usize);
|
||||
lists.truncate(opts.ivf_nprobe as usize);
|
||||
let mut result = Vec::new();
|
||||
for i in lists.iter().map(|(_, i)| *i) {
|
||||
let start = self.offsets[i];
|
||||
let end = self.offsets[i + 1];
|
||||
for j in start..end {
|
||||
let payload = self.payloads[j as usize];
|
||||
let distance = self.quantization.distance(&self.storage, vector, j);
|
||||
result.push(Element { distance, payload });
|
||||
}
|
||||
}
|
||||
(result, Box::new(std::iter::empty()))
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl<O: Op> Send for IvfNaive<O> {}
|
||||
unsafe impl<O: Op> Sync for IvfNaive<O> {}
|
||||
|
||||
pub struct IvfRam<O: Op> {
|
||||
storage: Arc<StorageCollection<O>>,
|
||||
quantization: Quantization<O, StorageCollection<O>>,
|
||||
// ----------------------
|
||||
dims: u32,
|
||||
// ----------------------
|
||||
nlist: u32,
|
||||
// ----------------------
|
||||
centroids: Vec2<Scalar<O>>,
|
||||
ptr: Vec<usize>,
|
||||
payloads: Vec<Payload>,
|
||||
}
|
||||
|
||||
unsafe impl<O: Op> Send for IvfRam<O> {}
|
||||
unsafe impl<O: Op> Sync for IvfRam<O> {}
|
||||
|
||||
pub struct IvfMmap<O: Op> {
|
||||
storage: Arc<StorageCollection<O>>,
|
||||
quantization: Quantization<O, StorageCollection<O>>,
|
||||
// ----------------------
|
||||
dims: u32,
|
||||
// ----------------------
|
||||
nlist: u32,
|
||||
// ----------------------
|
||||
centroids: MmapArray<Scalar<O>>,
|
||||
ptr: MmapArray<usize>,
|
||||
payloads: MmapArray<Payload>,
|
||||
}
|
||||
|
||||
unsafe impl<O: Op> Send for IvfMmap<O> {}
|
||||
unsafe impl<O: Op> Sync for IvfMmap<O> {}
|
||||
|
||||
impl<O: Op> IvfMmap<O> {
|
||||
fn centroids(&self, i: u32) -> &[Scalar<O>] {
|
||||
let s = i as usize * self.dims as usize;
|
||||
let e = (i + 1) as usize * self.dims as usize;
|
||||
&self.centroids[s..e]
|
||||
}
|
||||
}
|
||||
|
||||
pub fn make<O: Op, S: Source<O>>(path: &Path, options: IndexOptions, source: &S) -> IvfRam<O> {
|
||||
let VectorOptions { dims, .. } = options.vector;
|
||||
fn from_nothing<O: Op>(
|
||||
path: impl AsRef<Path>,
|
||||
options: IndexOptions,
|
||||
collection: &impl Collection<O>,
|
||||
) -> IvfNaive<O> {
|
||||
create_dir(path.as_ref()).unwrap();
|
||||
let IvfIndexingOptions {
|
||||
least_iterations,
|
||||
iterations,
|
||||
nlist,
|
||||
nsample,
|
||||
quantization: quantization_opts,
|
||||
quantization: quantization_options,
|
||||
} = options.indexing.clone().unwrap_ivf();
|
||||
let storage = Arc::new(StorageCollection::<O>::create(&path.join("raw"), source));
|
||||
let n = storage.len();
|
||||
let m = std::cmp::min(nsample, n);
|
||||
let f = sample(&mut thread_rng(), n as usize, m as usize).into_vec();
|
||||
let mut samples = Vec2::new(dims, m as usize);
|
||||
for i in 0..m {
|
||||
samples[i as usize].copy_from_slice(storage.vector(f[i as usize] as u32).to_vec().as_ref());
|
||||
O::elkan_k_means_normalize(&mut samples[i as usize]);
|
||||
let samples = common::sample::sample(collection);
|
||||
rayon::check();
|
||||
let centroids = elkan_k_means::<O>(nlist as usize, samples);
|
||||
rayon::check();
|
||||
let mut ls = vec![Vec::new(); nlist as usize];
|
||||
for i in 0..collection.len() {
|
||||
ls[elkan_k_means_lookup::<O>(collection.vector(i), &centroids)].push(i);
|
||||
}
|
||||
rayon::check();
|
||||
let mut k_means = ElkanKMeans::<O>::new(nlist as usize, samples);
|
||||
for _ in 0..least_iterations {
|
||||
rayon::check();
|
||||
k_means.iterate();
|
||||
}
|
||||
for _ in least_iterations..iterations {
|
||||
rayon::check();
|
||||
if k_means.iterate() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let centroids = k_means.finish();
|
||||
let mut idx = vec![0usize; n as usize];
|
||||
idx.par_iter_mut().enumerate().for_each(|(i, x)| {
|
||||
rayon::check();
|
||||
let mut vector = storage.vector(i as u32).to_vec();
|
||||
O::elkan_k_means_normalize(&mut vector);
|
||||
let mut result = (F32::infinity(), 0);
|
||||
for i in 0..nlist as usize {
|
||||
let dis = O::elkan_k_means_distance(&vector, &centroids[i]);
|
||||
result = std::cmp::min(result, (dis, i));
|
||||
}
|
||||
*x = result.1;
|
||||
});
|
||||
let mut invlists_ids = vec![Vec::new(); nlist as usize];
|
||||
let mut invlists_payloads = vec![Vec::new(); nlist as usize];
|
||||
for i in 0..n {
|
||||
invlists_ids[idx[i as usize]].push(i);
|
||||
invlists_payloads[idx[i as usize]].push(storage.payload(i));
|
||||
}
|
||||
rayon::check();
|
||||
let permutation = Vec::from_iter((0..nlist).flat_map(|i| &invlists_ids[i as usize]).copied());
|
||||
rayon::check();
|
||||
let payloads = Vec::from_iter(
|
||||
(0..nlist)
|
||||
.flat_map(|i| &invlists_payloads[i as usize])
|
||||
.copied(),
|
||||
);
|
||||
rayon::check();
|
||||
let quantization = Quantization::create(
|
||||
&path.join("quantization"),
|
||||
options.clone(),
|
||||
quantization_opts,
|
||||
&storage,
|
||||
permutation,
|
||||
);
|
||||
rayon::check();
|
||||
let mut ptr = vec![0usize; nlist as usize + 1];
|
||||
let mut offsets = vec![0u32; nlist as usize + 1];
|
||||
for i in 0..nlist {
|
||||
ptr[i as usize + 1] = ptr[i as usize] + invlists_ids[i as usize].len();
|
||||
offsets[i as usize + 1] = offsets[i as usize] + ls[i as usize].len() as u32;
|
||||
}
|
||||
IvfRam {
|
||||
storage,
|
||||
quantization,
|
||||
centroids,
|
||||
nlist,
|
||||
dims,
|
||||
ptr,
|
||||
payloads,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn save<O: Op>(ram: IvfRam<O>, path: &Path) -> IvfMmap<O> {
|
||||
let centroids = MmapArray::create(
|
||||
&path.join("centroids"),
|
||||
(0..ram.nlist)
|
||||
.flat_map(|i| &ram.centroids[i as usize])
|
||||
.copied(),
|
||||
);
|
||||
let ptr = MmapArray::create(&path.join("ptr"), ram.ptr.iter().copied());
|
||||
let payloads = MmapArray::create(&path.join("payload"), ram.payloads.iter().copied());
|
||||
IvfMmap {
|
||||
storage: ram.storage,
|
||||
quantization: ram.quantization,
|
||||
dims: ram.dims,
|
||||
nlist: ram.nlist,
|
||||
centroids,
|
||||
ptr,
|
||||
payloads,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn open<O: Op>(path: &Path, options: IndexOptions) -> IvfMmap<O> {
|
||||
let storage = Arc::new(StorageCollection::open(&path.join("raw"), options.clone()));
|
||||
let quantization = Quantization::open(
|
||||
&path.join("quantization"),
|
||||
let remap = ls
|
||||
.into_iter()
|
||||
.flat_map(|x| x.into_iter())
|
||||
.collect::<Vec<_>>();
|
||||
let collection = RemappedCollection::from_collection(collection, remap);
|
||||
rayon::check();
|
||||
let storage = O::Storage::create(path.as_ref().join("storage"), &collection);
|
||||
let quantization = Quantization::create(
|
||||
path.as_ref().join("quantization"),
|
||||
options.clone(),
|
||||
options.indexing.clone().unwrap_ivf().quantization,
|
||||
&storage,
|
||||
quantization_options,
|
||||
&collection,
|
||||
);
|
||||
let centroids = MmapArray::open(&path.join("centroids"));
|
||||
let ptr = MmapArray::open(&path.join("ptr"));
|
||||
let payloads = MmapArray::open(&path.join("payload"));
|
||||
let IvfIndexingOptions { nlist, .. } = options.indexing.unwrap_ivf();
|
||||
IvfMmap {
|
||||
let payloads = MmapArray::create(
|
||||
path.as_ref().join("payloads"),
|
||||
(0..collection.len()).map(|i| collection.payload(i)),
|
||||
);
|
||||
let offsets = Json::create(path.as_ref().join("offsets"), offsets);
|
||||
let centroids = Json::create(path.as_ref().join("centroids"), centroids);
|
||||
sync_dir(path);
|
||||
IvfNaive {
|
||||
storage,
|
||||
quantization,
|
||||
dims: options.vector.dims,
|
||||
nlist,
|
||||
centroids,
|
||||
ptr,
|
||||
payloads,
|
||||
offsets,
|
||||
centroids,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn basic<O: Op>(
|
||||
mmap: &IvfMmap<O>,
|
||||
vector: Borrowed<'_, O>,
|
||||
nprobe: u32,
|
||||
) -> BinaryHeap<Reverse<Element>> {
|
||||
let mut target = vector.to_vec();
|
||||
O::elkan_k_means_normalize(&mut target);
|
||||
let mut lists = Vec::with_capacity(mmap.nlist as usize);
|
||||
for i in 0..mmap.nlist {
|
||||
let centroid = mmap.centroids(i);
|
||||
let distance = O::elkan_k_means_distance(&target, centroid);
|
||||
lists.push((distance, i));
|
||||
fn open<O: Op>(path: impl AsRef<Path>) -> IvfNaive<O> {
|
||||
let storage = O::Storage::open(path.as_ref().join("storage"));
|
||||
let quantization = Quantization::open(path.as_ref().join("quantization"));
|
||||
let payloads = MmapArray::open(path.as_ref().join("payloads"));
|
||||
let offsets = Json::open(path.as_ref().join("offsets"));
|
||||
let centroids = Json::open(path.as_ref().join("centroids"));
|
||||
IvfNaive {
|
||||
storage,
|
||||
quantization,
|
||||
payloads,
|
||||
offsets,
|
||||
centroids,
|
||||
}
|
||||
if nprobe < mmap.nlist {
|
||||
lists.select_nth_unstable(nprobe as usize);
|
||||
lists.truncate(nprobe as usize);
|
||||
}
|
||||
let mut result = BinaryHeap::new();
|
||||
for i in lists.iter().map(|(_, i)| *i as usize) {
|
||||
let start = mmap.ptr[i];
|
||||
let end = mmap.ptr[i + 1];
|
||||
for j in start..end {
|
||||
let payload = mmap.payloads[j];
|
||||
let distance = mmap.quantization.distance(vector, j as u32);
|
||||
result.push(Reverse(Element { distance, payload }));
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn vbase<'a, O: Op>(
|
||||
mmap: &'a IvfMmap<O>,
|
||||
vector: Borrowed<'a, O>,
|
||||
nprobe: u32,
|
||||
) -> (Vec<Element>, Box<(dyn Iterator<Item = Element> + 'a)>) {
|
||||
let mut target = vector.to_vec();
|
||||
O::elkan_k_means_normalize(&mut target);
|
||||
let mut lists = Vec::with_capacity(mmap.nlist as usize);
|
||||
for i in 0..mmap.nlist {
|
||||
let centroid = mmap.centroids(i);
|
||||
let distance = O::elkan_k_means_distance(&target, centroid);
|
||||
lists.push((distance, i));
|
||||
}
|
||||
if nprobe < mmap.nlist {
|
||||
lists.select_nth_unstable(nprobe as usize);
|
||||
lists.truncate(nprobe as usize);
|
||||
}
|
||||
let mut result = Vec::new();
|
||||
for i in lists.iter().map(|(_, i)| *i as usize) {
|
||||
let start = mmap.ptr[i];
|
||||
let end = mmap.ptr[i + 1];
|
||||
for j in start..end {
|
||||
let payload = mmap.payloads[j];
|
||||
let distance = mmap.quantization.distance(vector, j as u32);
|
||||
result.push(Element { distance, payload });
|
||||
}
|
||||
}
|
||||
(result, Box::new(std::iter::empty()))
|
||||
}
|
||||
|
@ -1,57 +1,53 @@
|
||||
use super::OperatorIvf as Op;
|
||||
use base::distance::*;
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
use base::scalar::*;
|
||||
use base::search::*;
|
||||
use base::vector::*;
|
||||
use common::dir_ops::sync_dir;
|
||||
use common::json::Json;
|
||||
use common::mmap_array::MmapArray;
|
||||
use common::remap::RemappedCollection;
|
||||
use common::vec2::Vec2;
|
||||
use elkan_k_means::ElkanKMeans;
|
||||
use num_traits::{Float, Zero};
|
||||
use quantization::product::operator::OperatorProductQuantization;
|
||||
use rand::seq::index::sample;
|
||||
use rand::thread_rng;
|
||||
use rayon::iter::IndexedParallelIterator;
|
||||
use rayon::iter::IntoParallelRefMutIterator;
|
||||
use rayon::iter::ParallelIterator;
|
||||
use rayon::slice::ParallelSliceMut;
|
||||
use elkan_k_means::elkan_k_means;
|
||||
use elkan_k_means::elkan_k_means_caluate;
|
||||
use elkan_k_means::elkan_k_means_lookup;
|
||||
use quantization::product::ProductQuantizer;
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fs::create_dir;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use storage::StorageCollection;
|
||||
use stoppable_rayon as rayon;
|
||||
use storage::Storage;
|
||||
|
||||
pub struct IvfPq<O: Op> {
|
||||
mmap: IvfMmap<O>,
|
||||
storage: O::Storage,
|
||||
payloads: MmapArray<Payload>,
|
||||
offsets: Json<Vec<u32>>,
|
||||
centroids: Json<Vec2<Scalar<O>>>,
|
||||
train: Json<ProductQuantizer<O>>,
|
||||
codes: MmapArray<u8>,
|
||||
}
|
||||
|
||||
impl<O: Op> IvfPq<O> {
|
||||
pub fn create<S: Source<O>>(path: &Path, options: IndexOptions, source: &S) -> Self {
|
||||
create_dir(path).unwrap();
|
||||
let ram = make(path, options, source);
|
||||
let mmap = save(ram, path);
|
||||
sync_dir(path);
|
||||
Self { mmap }
|
||||
/// Builds a new index at `path` from `source`.
///
/// `source` is first wrapped in a `RemappedCollection`; the actual build is
/// delegated to the module-level `from_nothing`.
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
|
||||
let remapped = RemappedCollection::from_source(source);
|
||||
from_nothing(path, options, &remapped)
|
||||
}
|
||||
|
||||
pub fn open(path: &Path, options: IndexOptions) -> Self {
|
||||
let mmap = open(path, options);
|
||||
Self { mmap }
|
||||
/// Opens an existing index stored at `path`.
///
/// Thin wrapper: the call below resolves to the module-level `open` function,
/// not to this method.
pub fn open(path: impl AsRef<Path>) -> Self {
|
||||
open(path)
|
||||
}
|
||||
|
||||
pub fn len(&self) -> u32 {
|
||||
self.mmap.storage.len()
|
||||
self.storage.len()
|
||||
}
|
||||
|
||||
pub fn vector(&self, i: u32) -> Borrowed<'_, O> {
|
||||
self.mmap.storage.vector(i)
|
||||
self.storage.vector(i)
|
||||
}
|
||||
|
||||
pub fn payload(&self, i: u32) -> Payload {
|
||||
self.mmap.storage.payload(i)
|
||||
self.payloads[i as usize]
|
||||
}
|
||||
|
||||
pub fn basic(
|
||||
@ -59,7 +55,27 @@ impl<O: Op> IvfPq<O> {
|
||||
vector: Borrowed<'_, O>,
|
||||
opts: &SearchOptions,
|
||||
) -> BinaryHeap<Reverse<Element>> {
|
||||
basic(&self.mmap, vector, opts.ivf_nprobe)
|
||||
let mut lists = elkan_k_means_caluate::<O>(vector, &self.centroids);
|
||||
lists.select_nth_unstable(opts.ivf_nprobe as usize);
|
||||
lists.truncate(opts.ivf_nprobe as usize);
|
||||
let mut result = BinaryHeap::new();
|
||||
for (_, i) in lists.into_iter() {
|
||||
let start = self.offsets[i];
|
||||
let end = self.offsets[i + 1];
|
||||
let delta = &self.centroids[i];
|
||||
for j in start..end {
|
||||
let payload = self.payloads[j as usize];
|
||||
let distance = {
|
||||
let width = self.train.width();
|
||||
let start = j as usize * width;
|
||||
let end = start + width;
|
||||
self.train
|
||||
.distance_with_delta(vector, &self.codes[start..end], delta)
|
||||
};
|
||||
result.push(Reverse(Element { distance, payload }));
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn vbase<'a>(
|
||||
@ -67,511 +83,116 @@ impl<O: Op> IvfPq<O> {
|
||||
vector: Borrowed<'a, O>,
|
||||
opts: &'a SearchOptions,
|
||||
) -> (Vec<Element>, Box<(dyn Iterator<Item = Element> + 'a)>) {
|
||||
vbase(&self.mmap, vector, opts.ivf_nprobe)
|
||||
let mut lists = elkan_k_means_caluate::<O>(vector, &self.centroids);
|
||||
lists.select_nth_unstable(opts.ivf_nprobe as usize);
|
||||
lists.truncate(opts.ivf_nprobe as usize);
|
||||
let mut result = Vec::new();
|
||||
for (_, i) in lists.into_iter() {
|
||||
let start = self.offsets[i];
|
||||
let end = self.offsets[i + 1];
|
||||
let delta = &self.centroids[i];
|
||||
for j in start..end {
|
||||
let payload = self.payloads[j as usize];
|
||||
let distance = {
|
||||
let width = self.train.width();
|
||||
let start = j as usize * width;
|
||||
let end = start + width;
|
||||
self.train
|
||||
.distance_with_delta(vector, &self.codes[start..end], delta)
|
||||
};
|
||||
result.push(Element { distance, payload });
|
||||
}
|
||||
}
|
||||
(result, Box::new(std::iter::empty()))
|
||||
}
|
||||
}
|
||||
|
||||
// Manual assertion that `IvfPq<O>` may be sent to and shared between threads
// for every `O: Op`; the compiler does not verify these impls.
// NOTE(review): the justification is not visible here — confirm that every
// field type is actually safe to share.
unsafe impl<O: Op> Send for IvfPq<O> {}
|
||||
unsafe impl<O: Op> Sync for IvfPq<O> {}
|
||||
|
||||
/// In-memory IVF index: the value returned by `make` and consumed by `save`.
pub struct IvfRam<O: Op> {
|
||||
// Raw vector storage, shared through an `Arc`.
storage: Arc<StorageCollection<O>>,
|
||||
// Product quantizer built over the residuals and the coarse centroids.
quantization: ProductQuantization<O>,
|
||||
// ----------------------
|
||||
// Vector dimensionality, taken from the index options.
dims: u32,
|
||||
// ----------------------
|
||||
// Number of inverted lists (one coarse centroid per list).
nlist: u32,
|
||||
// ----------------------
|
||||
// Coarse centroids, one row per list.
centroids: Vec2<Scalar<O>>,
|
||||
// Prefix sums of the list sizes: list `i` owns slots `ptr[i]..ptr[i + 1]`.
ptr: Vec<usize>,
|
||||
// Payloads of all vectors, concatenated list by list.
payloads: Vec<Payload>,
|
||||
}
|
||||
|
||||
// Manual assertion that `IvfRam<O>` is `Send` and `Sync` for every `O: Op`;
// the compiler does not verify these impls.
// NOTE(review): the justification is not visible here — confirm.
unsafe impl<O: Op> Send for IvfRam<O> {}
|
||||
unsafe impl<O: Op> Sync for IvfRam<O> {}
|
||||
|
||||
/// Memory-mapped IVF index: produced by `save` from an `IvfRam`, or by `open`.
pub struct IvfMmap<O: Op> {
|
||||
// Raw vector storage, shared through an `Arc`.
storage: Arc<StorageCollection<O>>,
|
||||
// Product quantizer used to score candidates in `basic` / `vbase`.
quantization: ProductQuantization<O>,
|
||||
// ----------------------
|
||||
// Vector dimensionality; also the length of one centroid row in `centroids`.
dims: u32,
|
||||
// ----------------------
|
||||
// Number of inverted lists.
nlist: u32,
|
||||
// ----------------------
|
||||
// All centroid rows concatenated; row `i` is `centroids[i * dims..(i + 1) * dims]`.
centroids: MmapArray<Scalar<O>>,
|
||||
// List boundaries: list `i` owns slots `ptr[i]..ptr[i + 1]`.
ptr: MmapArray<usize>,
|
||||
// Payload of every slot, indexed by slot number.
payloads: MmapArray<Payload>,
|
||||
}
|
||||
|
||||
// Manual assertion that `IvfMmap<O>` is `Send` and `Sync` for every `O: Op`;
// the compiler does not verify these impls.
// NOTE(review): the justification is not visible here — confirm.
unsafe impl<O: Op> Send for IvfMmap<O> {}
|
||||
unsafe impl<O: Op> Sync for IvfMmap<O> {}
|
||||
|
||||
impl<O: Op> IvfMmap<O> {
|
||||
fn centroids(&self, i: u32) -> &[Scalar<O>] {
|
||||
let s = i as usize * self.dims as usize;
|
||||
let e = (i + 1) as usize * self.dims as usize;
|
||||
&self.centroids[s..e]
|
||||
}
|
||||
}
|
||||
|
||||
pub fn make<O: Op, S: Source<O>>(path: &Path, options: IndexOptions, source: &S) -> IvfRam<O> {
|
||||
let VectorOptions { dims, .. } = options.vector;
|
||||
fn from_nothing<O: Op>(
|
||||
path: impl AsRef<Path>,
|
||||
options: IndexOptions,
|
||||
collection: &impl Collection<O>,
|
||||
) -> IvfPq<O> {
|
||||
create_dir(path.as_ref()).unwrap();
|
||||
let IvfIndexingOptions {
|
||||
least_iterations,
|
||||
iterations,
|
||||
nlist,
|
||||
nsample,
|
||||
quantization: quantization_opts,
|
||||
quantization: quantization_options,
|
||||
} = options.indexing.clone().unwrap_ivf();
|
||||
let storage = Arc::new(StorageCollection::<O>::create(&path.join("raw"), source));
|
||||
let n = storage.len();
|
||||
let m = std::cmp::min(nsample, n);
|
||||
let f = sample(&mut thread_rng(), n as usize, m as usize).into_vec();
|
||||
let mut samples = Vec2::new(dims, m as usize);
|
||||
for i in 0..m {
|
||||
samples[i as usize].copy_from_slice(storage.vector(f[i as usize] as u32).to_vec().as_ref());
|
||||
O::elkan_k_means_normalize(&mut samples[i as usize]);
|
||||
}
|
||||
let product_quantization_options = quantization_options.unwrap_product();
|
||||
let samples = common::sample::sample(collection);
|
||||
rayon::check();
|
||||
let mut k_means = ElkanKMeans::<O>::new(nlist as usize, samples);
|
||||
for _ in 0..least_iterations {
|
||||
rayon::check();
|
||||
k_means.iterate();
|
||||
let centroids = elkan_k_means::<O>(nlist as usize, samples);
|
||||
rayon::check();
|
||||
let mut ls = vec![Vec::new(); nlist as usize];
|
||||
for i in 0..collection.len() {
|
||||
ls[elkan_k_means_lookup::<O>(collection.vector(i), ¢roids)].push(i);
|
||||
}
|
||||
for _ in least_iterations..iterations {
|
||||
rayon::check();
|
||||
if k_means.iterate() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let centroids = k_means.finish();
|
||||
let mut idx = vec![0usize; n as usize];
|
||||
idx.par_iter_mut().enumerate().for_each(|(i, x)| {
|
||||
rayon::check();
|
||||
let mut vector = storage.vector(i as u32).to_vec();
|
||||
O::elkan_k_means_normalize(&mut vector);
|
||||
let mut result = (F32::infinity(), 0);
|
||||
for i in 0..nlist as usize {
|
||||
let dis = O::elkan_k_means_distance(&vector, ¢roids[i]);
|
||||
result = std::cmp::min(result, (dis, i));
|
||||
}
|
||||
*x = result.1;
|
||||
});
|
||||
let mut invlists_ids = vec![Vec::new(); nlist as usize];
|
||||
let mut invlists_payloads = vec![Vec::new(); nlist as usize];
|
||||
for i in 0..n {
|
||||
invlists_ids[idx[i as usize]].push(i);
|
||||
invlists_payloads[idx[i as usize]].push(storage.payload(i));
|
||||
}
|
||||
let mut ptr = vec![0usize; nlist as usize + 1];
|
||||
let mut offsets = vec![0u32; nlist as usize + 1];
|
||||
for i in 0..nlist {
|
||||
ptr[i as usize + 1] = ptr[i as usize] + invlists_ids[i as usize].len();
|
||||
offsets[i as usize + 1] = offsets[i as usize] + ls[i as usize].len() as u32;
|
||||
}
|
||||
let ids = Vec::from_iter((0..nlist).flat_map(|i| &invlists_ids[i as usize]).copied());
|
||||
let payloads = Vec::from_iter(
|
||||
(0..nlist)
|
||||
.flat_map(|i| &invlists_payloads[i as usize])
|
||||
.copied(),
|
||||
);
|
||||
let remap = ls
|
||||
.into_iter()
|
||||
.flat_map(|x| x.into_iter())
|
||||
.collect::<Vec<_>>();
|
||||
let collection = RemappedCollection::from_collection(collection, remap);
|
||||
rayon::check();
|
||||
let residuals = {
|
||||
let mut residuals = Vec2::new(options.vector.dims, n as usize);
|
||||
residuals
|
||||
.par_chunks_mut(dims as usize)
|
||||
.enumerate()
|
||||
.for_each(|(i, v)| {
|
||||
for j in 0..dims {
|
||||
v[j as usize] = storage.vector(ids[i]).to_vec()[j as usize]
|
||||
- centroids[idx[ids[i] as usize]][j as usize];
|
||||
}
|
||||
});
|
||||
residuals
|
||||
};
|
||||
let quantization = ProductQuantization::create(
|
||||
&path.join("quantization"),
|
||||
options.clone(),
|
||||
quantization_opts,
|
||||
&residuals,
|
||||
¢roids,
|
||||
let storage = O::Storage::create(path.as_ref().join("storage"), &collection);
|
||||
let payloads = MmapArray::create(
|
||||
path.as_ref().join("payloads"),
|
||||
(0..collection.len()).map(|i| collection.payload(i)),
|
||||
);
|
||||
IvfRam {
|
||||
let offsets = Json::create(path.as_ref().join("offsets"), offsets);
|
||||
let centroids = Json::create(path.as_ref().join("centroids"), centroids);
|
||||
let train = Json::create(
|
||||
path.as_ref().join("train"),
|
||||
ProductQuantizer::train_transform(
|
||||
options,
|
||||
product_quantization_options,
|
||||
&collection,
|
||||
|v, start, end| {
|
||||
let target = elkan_k_means::elkan_k_means_lookup_dense::<O>(v.to_vec(), ¢roids);
|
||||
for i in start..end {
|
||||
v[i] -= centroids[target][i];
|
||||
}
|
||||
&v[start..end]
|
||||
},
|
||||
),
|
||||
);
|
||||
let codes = MmapArray::create(
|
||||
path.as_ref().join("codes"),
|
||||
(0..collection.len()).flat_map(|i| {
|
||||
let mut v = collection.vector(i).to_vec();
|
||||
let target = elkan_k_means::elkan_k_means_lookup_dense::<O>(v.clone(), ¢roids);
|
||||
for i in 0..collection.dims() as usize {
|
||||
v[i] -= centroids[target][i];
|
||||
}
|
||||
train.encode(&v).into_iter()
|
||||
}),
|
||||
);
|
||||
sync_dir(path);
|
||||
IvfPq {
|
||||
storage,
|
||||
quantization,
|
||||
centroids,
|
||||
nlist,
|
||||
dims,
|
||||
ptr,
|
||||
payloads,
|
||||
offsets,
|
||||
centroids,
|
||||
train,
|
||||
codes,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn save<O: Op>(ram: IvfRam<O>, path: &Path) -> IvfMmap<O> {
|
||||
let centroids = MmapArray::create(
|
||||
&path.join("centroids"),
|
||||
(0..ram.nlist)
|
||||
.flat_map(|i| &ram.centroids[i as usize])
|
||||
.copied(),
|
||||
);
|
||||
let ptr = MmapArray::create(&path.join("ptr"), ram.ptr.iter().copied());
|
||||
let payloads = MmapArray::create(&path.join("payload"), ram.payloads.iter().copied());
|
||||
IvfMmap {
|
||||
storage: ram.storage,
|
||||
quantization: ram.quantization,
|
||||
dims: ram.dims,
|
||||
nlist: ram.nlist,
|
||||
centroids,
|
||||
ptr,
|
||||
payloads,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn open<O: Op>(path: &Path, options: IndexOptions) -> IvfMmap<O> {
|
||||
let storage = Arc::new(StorageCollection::open(&path.join("raw"), options.clone()));
|
||||
let quantization = ProductQuantization::open(
|
||||
&path.join("quantization"),
|
||||
options.clone(),
|
||||
options.indexing.clone().unwrap_ivf().quantization,
|
||||
&storage,
|
||||
);
|
||||
let centroids = MmapArray::open(&path.join("centroids"));
|
||||
let ptr = MmapArray::open(&path.join("ptr"));
|
||||
let payloads = MmapArray::open(&path.join("payload"));
|
||||
let IvfIndexingOptions { nlist, .. } = options.indexing.unwrap_ivf();
|
||||
IvfMmap {
|
||||
fn open<O: Op>(path: impl AsRef<Path>) -> IvfPq<O> {
|
||||
let storage = O::Storage::open(path.as_ref().join("storage"));
|
||||
let payloads = MmapArray::open(path.as_ref().join("payloads"));
|
||||
let offsets = Json::open(path.as_ref().join("offsets"));
|
||||
let centroids = Json::open(path.as_ref().join("centroids"));
|
||||
let train = Json::open(path.as_ref().join("train"));
|
||||
let codes = MmapArray::open(path.as_ref().join("codes"));
|
||||
IvfPq {
|
||||
storage,
|
||||
quantization,
|
||||
dims: options.vector.dims,
|
||||
nlist,
|
||||
centroids,
|
||||
ptr,
|
||||
payloads,
|
||||
offsets,
|
||||
centroids,
|
||||
train,
|
||||
codes,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn basic<O: Op>(
|
||||
mmap: &IvfMmap<O>,
|
||||
vector: Borrowed<'_, O>,
|
||||
nprobe: u32,
|
||||
) -> BinaryHeap<Reverse<Element>> {
|
||||
let dense = vector.to_vec();
|
||||
let mut lists = Vec::with_capacity(mmap.nlist as usize);
|
||||
for i in 0..mmap.nlist {
|
||||
let centroid = mmap.centroids(i);
|
||||
let distance = O::product_quantization_dense_distance(&dense, centroid);
|
||||
lists.push((distance, i));
|
||||
}
|
||||
if nprobe < mmap.nlist {
|
||||
lists.select_nth_unstable(nprobe as usize);
|
||||
lists.truncate(nprobe as usize);
|
||||
}
|
||||
let runtime_table = mmap.quantization.init_query(vector.to_vec().as_ref());
|
||||
let mut result = BinaryHeap::new();
|
||||
for &(coarse_dis, key) in lists.iter() {
|
||||
let start = mmap.ptr[key as usize];
|
||||
let end = mmap.ptr[key as usize + 1];
|
||||
for j in start..end {
|
||||
let payload = mmap.payloads[j];
|
||||
let distance = mmap.quantization.distance_with_codes(
|
||||
vector,
|
||||
j as u32,
|
||||
mmap.centroids(key),
|
||||
key as usize,
|
||||
coarse_dis,
|
||||
&runtime_table,
|
||||
);
|
||||
result.push(Reverse(Element { distance, payload }));
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn vbase<'a, O: Op>(
|
||||
mmap: &'a IvfMmap<O>,
|
||||
vector: Borrowed<'a, O>,
|
||||
nprobe: u32,
|
||||
) -> (Vec<Element>, Box<(dyn Iterator<Item = Element> + 'a)>) {
|
||||
let dense = vector.to_vec();
|
||||
let mut lists = Vec::with_capacity(mmap.nlist as usize);
|
||||
for i in 0..mmap.nlist {
|
||||
let centroid = mmap.centroids(i);
|
||||
let distance = O::product_quantization_dense_distance(&dense, centroid);
|
||||
lists.push((distance, i));
|
||||
}
|
||||
if nprobe < mmap.nlist {
|
||||
lists.select_nth_unstable(nprobe as usize);
|
||||
lists.truncate(nprobe as usize);
|
||||
}
|
||||
let runtime_table = mmap.quantization.init_query(vector.to_vec().as_ref());
|
||||
let mut result = Vec::new();
|
||||
for &(coarse_dis, key) in lists.iter() {
|
||||
let start = mmap.ptr[key as usize];
|
||||
let end = mmap.ptr[key as usize + 1];
|
||||
for j in start..end {
|
||||
let payload = mmap.payloads[j];
|
||||
let distance = mmap.quantization.distance_with_codes(
|
||||
vector,
|
||||
j as u32,
|
||||
mmap.centroids(key),
|
||||
key as usize,
|
||||
coarse_dis,
|
||||
&runtime_table,
|
||||
);
|
||||
result.push(Element { distance, payload });
|
||||
}
|
||||
}
|
||||
(result, Box::new(std::iter::empty()))
|
||||
}
|
||||
|
||||
/// Product quantizer with 256 codewords per sub-quantizer (one `u8` code each).
pub struct ProductQuantization<O: Op> {
|
||||
// Dimensionality of the quantized vectors.
dims: u32,
|
||||
// Number of dimensions per sub-quantizer; a vector takes `dims.div_ceil(ratio)` codes.
ratio: u32,
|
||||
// 256 full-length codeword rows: row `k` is `centroids[k * dims..(k + 1) * dims]`.
centroids: Vec<Scalar<O>>,
|
||||
// Codes of all vectors, `dims.div_ceil(ratio)` bytes per vector.
codes: MmapArray<u8>,
|
||||
// Per coarse list, per sub-quantizer, per codeword: |r|^2 + 2 * <c, r>,
// laid out as `[list][sub-quantizer][codeword]`.
precomputed_table: Vec<F32>,
|
||||
}
|
||||
|
||||
// Manual assertion that `ProductQuantization<O>` is `Send` and `Sync` for
// every `O: Op`; the compiler does not verify these impls.
// NOTE(review): the justification is not visible here — confirm.
unsafe impl<O: Op> Send for ProductQuantization<O> {}
|
||||
unsafe impl<O: Op> Sync for ProductQuantization<O> {}
|
||||
|
||||
impl<O: Op> ProductQuantization<O> {
|
||||
pub fn codes(&self, i: u32) -> &[u8] {
|
||||
let width = self.dims.div_ceil(self.ratio);
|
||||
let s = i as usize * width as usize;
|
||||
let e = (i + 1) as usize * width as usize;
|
||||
&self.codes[s..e]
|
||||
}
|
||||
pub fn open(
|
||||
path: &Path,
|
||||
options: IndexOptions,
|
||||
quantization_options: QuantizationOptions,
|
||||
_: &Arc<StorageCollection<O>>,
|
||||
) -> Self {
|
||||
let QuantizationOptions::Product(quantization_options) = quantization_options else {
|
||||
unreachable!()
|
||||
};
|
||||
let centroids =
|
||||
serde_json::from_slice(&std::fs::read(path.join("centroids")).unwrap()).unwrap();
|
||||
let codes = MmapArray::open(&path.join("codes"));
|
||||
let precomputed_table =
|
||||
serde_json::from_slice(&std::fs::read(path.join("table")).unwrap()).unwrap();
|
||||
Self {
|
||||
dims: options.vector.dims,
|
||||
ratio: quantization_options.ratio as _,
|
||||
centroids,
|
||||
codes,
|
||||
precomputed_table,
|
||||
}
|
||||
}
|
||||
pub fn create(
|
||||
path: &Path,
|
||||
options: IndexOptions,
|
||||
quantization_options: QuantizationOptions,
|
||||
v2: &Vec2<Scalar<O>>,
|
||||
coarse_centroids: &Vec2<Scalar<O>>,
|
||||
) -> Self {
|
||||
create_dir(path).unwrap();
|
||||
let QuantizationOptions::Product(quantization_options) = quantization_options else {
|
||||
unreachable!()
|
||||
};
|
||||
let dims = options.vector.dims;
|
||||
let ratio = quantization_options.ratio as u32;
|
||||
let n = v2.len();
|
||||
let m = std::cmp::min(n, quantization_options.sample as usize);
|
||||
let samples = {
|
||||
let f = sample(&mut thread_rng(), n, m).into_vec();
|
||||
let mut samples = Vec2::new(dims, m);
|
||||
for i in 0..m {
|
||||
samples[i].copy_from_slice(&v2[f[i]]);
|
||||
}
|
||||
samples
|
||||
};
|
||||
let width = dims.div_ceil(ratio);
|
||||
// a temp layout (width * 256 * subdims) for par_chunks_mut
|
||||
let mut tmp_centroids = vec![Scalar::<O>::zero(); 256 * dims as usize];
|
||||
// this par_for parallelizes over sub quantizers
|
||||
tmp_centroids
|
||||
.par_chunks_mut(256 * ratio as usize)
|
||||
.enumerate()
|
||||
.for_each(|(i, v)| {
|
||||
// i is the index of subquantizer
|
||||
let subdims = std::cmp::min(ratio, dims - ratio * i as u32) as usize;
|
||||
let mut subsamples = Vec2::new(subdims as u32, m);
|
||||
for j in 0..m {
|
||||
let src = &samples[j][i * ratio as usize..][..subdims];
|
||||
subsamples[j].copy_from_slice(src);
|
||||
}
|
||||
let mut k_means = ElkanKMeans::<O::ProductQuantizationL2>::new(256, subsamples);
|
||||
for _ in 0..25 {
|
||||
if k_means.iterate() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let centroid = k_means.finish();
|
||||
for j in 0usize..=255 {
|
||||
v[j * subdims..][..subdims].copy_from_slice(¢roid[j]);
|
||||
}
|
||||
});
|
||||
// transform back to normal layout (256 * width * subdims)
|
||||
let mut centroids = vec![Scalar::<O>::zero(); 256 * dims as usize];
|
||||
centroids
|
||||
.par_chunks_mut(dims as usize)
|
||||
.enumerate()
|
||||
.for_each(|(i, v)| {
|
||||
for j in 0..width {
|
||||
let subdims = std::cmp::min(ratio, dims - ratio * j) as usize;
|
||||
v[(j * ratio) as usize..][..subdims].copy_from_slice(
|
||||
&tmp_centroids[(j * ratio) as usize * 256..][i * subdims..][..subdims],
|
||||
);
|
||||
}
|
||||
});
|
||||
let mut codes = vec![0u8; n * width as usize];
|
||||
codes
|
||||
.par_chunks_mut(width as usize)
|
||||
.enumerate()
|
||||
.for_each(|(id, v)| {
|
||||
let vector = v2[id].to_vec();
|
||||
let width = dims.div_ceil(ratio);
|
||||
for i in 0..width {
|
||||
let subdims = std::cmp::min(ratio, dims - ratio * i);
|
||||
let mut minimal = F32::infinity();
|
||||
let mut target = 0u8;
|
||||
let left = &vector[(i * ratio) as usize..][..subdims as usize];
|
||||
for j in 0u8..=255 {
|
||||
let right = ¢roids[j as usize * dims as usize..]
|
||||
[(i * ratio) as usize..][..subdims as usize];
|
||||
let dis = O::ProductQuantizationL2::product_quantization_dense_distance(
|
||||
left, right,
|
||||
);
|
||||
if dis < minimal {
|
||||
minimal = dis;
|
||||
target = j;
|
||||
}
|
||||
}
|
||||
v[i as usize] = target;
|
||||
}
|
||||
});
|
||||
sync_dir(path);
|
||||
std::fs::write(
|
||||
path.join("centroids"),
|
||||
serde_json::to_string(¢roids).unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let codes = MmapArray::create(&path.join("codes"), codes.into_iter());
|
||||
// precompute_table
|
||||
let nlist = coarse_centroids.len();
|
||||
let width = dims.div_ceil(ratio);
|
||||
let mut precomputed_table = Vec::new();
|
||||
precomputed_table.resize(nlist * width as usize * 256, F32::zero());
|
||||
precomputed_table
|
||||
.par_chunks_mut(width as usize * 256)
|
||||
.enumerate()
|
||||
.for_each(|(i, v)| {
|
||||
let x_c = &coarse_centroids[i];
|
||||
for j in 0..width {
|
||||
let subdims = std::cmp::min(ratio, dims - ratio * j);
|
||||
let sub_x_c = &x_c[(j * ratio) as usize..][..subdims as usize];
|
||||
for k in 0usize..256 {
|
||||
let sub_x_r = ¢roids[k * dims as usize..][(j * ratio) as usize..]
|
||||
[..subdims as usize];
|
||||
v[j as usize * 256 + k] = squared_norm::<O>(subdims, sub_x_r)
|
||||
+ F32(2.0) * inner_product::<O>(subdims, sub_x_c, sub_x_r);
|
||||
}
|
||||
}
|
||||
});
|
||||
std::fs::write(
|
||||
path.join("table"),
|
||||
serde_json::to_string(&precomputed_table).unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
Self {
|
||||
dims,
|
||||
ratio,
|
||||
centroids,
|
||||
codes,
|
||||
precomputed_table,
|
||||
}
|
||||
}
|
||||
|
||||
// compute term2 at query time
|
||||
pub fn init_query(&self, query: &[Scalar<O>]) -> Vec<F32> {
|
||||
match O::DISTANCE_KIND {
|
||||
DistanceKind::Cos => Vec::new(),
|
||||
DistanceKind::L2 | DistanceKind::Dot | DistanceKind::Jaccard => {
|
||||
let dims = self.dims;
|
||||
let ratio = self.ratio;
|
||||
let width = dims.div_ceil(ratio);
|
||||
let mut runtime_table = vec![F32::zero(); width as usize * 256];
|
||||
for i in 0..256 {
|
||||
for j in 0..width {
|
||||
let subdims = std::cmp::min(ratio, dims - ratio * j);
|
||||
let sub_query = &query[(j * ratio) as usize..][..subdims as usize];
|
||||
let centroid = &self.centroids[i * dims as usize..][(j * ratio) as usize..]
|
||||
[..subdims as usize];
|
||||
runtime_table[j as usize * 256 + i] =
|
||||
F32(-1.0) * inner_product::<O>(subdims, sub_query, centroid);
|
||||
}
|
||||
}
|
||||
runtime_table
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// add up all terms given codes
|
||||
pub fn distance_with_codes(
|
||||
&self,
|
||||
lhs: Borrowed<'_, O>,
|
||||
rhs: u32,
|
||||
delta: &[Scalar<O>],
|
||||
key: usize,
|
||||
coarse_dis: F32,
|
||||
runtime_table: &[F32],
|
||||
) -> F32 {
|
||||
let codes = self.codes(rhs);
|
||||
let width = self.dims.div_ceil(self.ratio);
|
||||
let precomputed_table = &self.precomputed_table[key * width as usize * 256..];
|
||||
match O::DISTANCE_KIND {
|
||||
DistanceKind::Cos => self.distance_with_delta(lhs, rhs, delta),
|
||||
DistanceKind::L2 => {
|
||||
let mut result = coarse_dis;
|
||||
for i in 0..width {
|
||||
result += precomputed_table[i as usize * 256 + codes[i as usize] as usize]
|
||||
+ F32(2.0) * runtime_table[i as usize * 256 + codes[i as usize] as usize];
|
||||
}
|
||||
result
|
||||
}
|
||||
DistanceKind::Dot => {
|
||||
let mut result = coarse_dis;
|
||||
for i in 0..width {
|
||||
result += runtime_table[i as usize * 256 + codes[i as usize] as usize];
|
||||
}
|
||||
result
|
||||
}
|
||||
DistanceKind::Jaccard => {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn distance_with_delta(&self, lhs: Borrowed<'_, O>, rhs: u32, delta: &[Scalar<O>]) -> F32 {
|
||||
let dims = self.dims;
|
||||
let ratio = self.ratio;
|
||||
let rhs = self.codes(rhs);
|
||||
O::product_quantization_distance_with_delta(dims, ratio, &self.centroids, lhs, rhs, delta)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn squared_norm<O: Op>(dims: u32, vec: &[Scalar<O>]) -> F32 {
|
||||
let mut result = F32::zero();
|
||||
for i in 0..dims as usize {
|
||||
result += F32((vec[i] * vec[i]).to_f32());
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn inner_product<O: Op>(dims: u32, lhs: &[Scalar<O>], rhs: &[Scalar<O>]) -> F32 {
|
||||
let mut result = F32::zero();
|
||||
for i in 0..dims as usize {
|
||||
result += F32((lhs[i] * rhs[i]).to_f32());
|
||||
}
|
||||
result
|
||||
}
|
||||
|
@ -1,4 +1,3 @@
|
||||
#![feature(trait_alias)]
|
||||
#![allow(clippy::len_without_is_empty)]
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
|
||||
@ -6,18 +5,25 @@ pub mod ivf_naive;
|
||||
pub mod ivf_pq;
|
||||
|
||||
use self::ivf_naive::IvfNaive;
|
||||
use self::ivf_pq::IvfPq;
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
use base::search::*;
|
||||
use common::dir_ops::sync_dir;
|
||||
use common::variants::variants;
|
||||
use elkan_k_means::operator::OperatorElkanKMeans;
|
||||
use ivf_pq::IvfPq;
|
||||
use quantization::operator::OperatorQuantization;
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::path::Path;
|
||||
use storage::operator::OperatorStorage;
|
||||
use storage::OperatorStorage;
|
||||
|
||||
pub trait OperatorIvf = Operator + OperatorElkanKMeans + OperatorQuantization + OperatorStorage;
|
||||
/// Bundle of the operator capabilities required by the IVF index: an ordinary
/// trait with four supertraits and no items of its own.
pub trait OperatorIvf:
|
||||
Operator + OperatorElkanKMeans + OperatorQuantization + OperatorStorage
|
||||
{
|
||||
}
|
||||
|
||||
// Blanket impl: every type that satisfies the four supertraits is an `OperatorIvf`.
impl<T: Operator + OperatorElkanKMeans + OperatorQuantization + OperatorStorage> OperatorIvf for T {}
|
||||
|
||||
pub enum Ivf<O: OperatorIvf> {
|
||||
Naive(IvfNaive<O>),
|
||||
@ -25,25 +31,30 @@ pub enum Ivf<O: OperatorIvf> {
|
||||
}
|
||||
|
||||
impl<O: OperatorIvf> Ivf<O> {
|
||||
pub fn create<S: Source<O>>(path: &Path, options: IndexOptions, source: &S) -> Self {
|
||||
if matches!(
|
||||
options.indexing.clone().unwrap_ivf().quantization,
|
||||
QuantizationOptions::Product(_)
|
||||
) {
|
||||
Self::Pq(IvfPq::create(path, options, source))
|
||||
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
|
||||
let IvfIndexingOptions {
|
||||
quantization: quantization_options,
|
||||
..
|
||||
} = options.indexing.clone().unwrap_ivf();
|
||||
std::fs::create_dir(path.as_ref()).unwrap();
|
||||
let this = if matches!(quantization_options, QuantizationOptions::Product(_)) {
|
||||
Self::Pq(IvfPq::create(path.as_ref().join("ivf_pq"), options, source))
|
||||
} else {
|
||||
Self::Naive(IvfNaive::create(path, options, source))
|
||||
}
|
||||
Self::Naive(IvfNaive::create(
|
||||
path.as_ref().join("ivf_naive"),
|
||||
options,
|
||||
source,
|
||||
))
|
||||
};
|
||||
sync_dir(path);
|
||||
this
|
||||
}
|
||||
|
||||
pub fn open(path: &Path, options: IndexOptions) -> Self {
|
||||
if matches!(
|
||||
options.indexing.clone().unwrap_ivf().quantization,
|
||||
QuantizationOptions::Product(_)
|
||||
) {
|
||||
Self::Pq(IvfPq::open(path, options))
|
||||
} else {
|
||||
Self::Naive(IvfNaive::open(path, options))
|
||||
pub fn open(path: impl AsRef<Path>) -> Self {
|
||||
match variants(path.as_ref(), ["ivf_naive", "ivf_pq"]) {
|
||||
"ivf_naive" => Self::Naive(IvfNaive::open(path.as_ref().join("naive"))),
|
||||
"ivf_pq" => todo!(),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,13 +1,11 @@
|
||||
#![feature(thread_local)]
|
||||
|
||||
use std::os::fd::OwnedFd;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub fn memfd_create() -> std::io::Result<OwnedFd> {
|
||||
use std::cell::Cell;
|
||||
#[thread_local]
|
||||
static SUPPORT_MEMFD: Cell<bool> = Cell::new(true);
|
||||
if SUPPORT_MEMFD.get() {
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::atomic::Ordering;
|
||||
static SUPPORT_MEMFD: AtomicBool = AtomicBool::new(true);
|
||||
if SUPPORT_MEMFD.load(Ordering::Relaxed) {
|
||||
use rustix::fs::MemfdFlags;
|
||||
let r = rustix::fs::memfd_create(
|
||||
format!(".memfd.MEMFD.{:x}", std::process::id()),
|
||||
@ -18,7 +16,7 @@ pub fn memfd_create() -> std::io::Result<OwnedFd> {
|
||||
return Ok(fd);
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::Unsupported => {
|
||||
SUPPORT_MEMFD.set(false);
|
||||
SUPPORT_MEMFD.store(false, Ordering::Relaxed);
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e.into());
|
||||
|
@ -6,6 +6,7 @@ edition.workspace = true
|
||||
[dependencies]
|
||||
num-traits.workspace = true
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
base = { path = "../base" }
|
||||
|
@ -1,104 +1,111 @@
|
||||
#![feature(doc_cfg)]
|
||||
#![feature(avx512_target_feature)]
|
||||
|
||||
pub mod operator;
|
||||
pub mod product;
|
||||
pub mod scalar;
|
||||
pub mod trivial;
|
||||
|
||||
use self::product::ProductQuantization;
|
||||
use self::scalar::ScalarQuantization;
|
||||
use self::trivial::TrivialQuantization;
|
||||
use self::product::ProductQuantizer;
|
||||
use self::scalar::ScalarQuantizer;
|
||||
use crate::operator::OperatorQuantization;
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
use base::scalar::*;
|
||||
use base::search::*;
|
||||
use base::vector::*;
|
||||
use common::dir_ops::sync_dir;
|
||||
use common::json::Json;
|
||||
use common::mmap_array::MmapArray;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub enum Quantization<O: OperatorQuantization, C: Collection<O>> {
|
||||
Trivial(TrivialQuantization<O, C>),
|
||||
Scalar(ScalarQuantization<O, C>),
|
||||
Product(ProductQuantization<O, C>),
|
||||
/// Trained quantizer state, serializable with serde.
///
/// The methods on this enum dispatch on the variant: `width` is the number of code bytes per
/// vector, `encode` produces those bytes, and `distance` compares a raw vector with a code.
// `bound = ""` replaces the trait bounds serde would otherwise infer for `O` with none.
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(bound = "")]
|
||||
pub enum Quantizer<O: OperatorQuantization> {
|
||||
// No quantization: width is 0, `encode` yields an empty code, `distance` calls the fallback.
Trivial,
|
||||
// Scalar quantization state.
Scalar(ScalarQuantizer<O>),
|
||||
// Product quantization state.
Product(ProductQuantizer<O>),
|
||||
}
|
||||
|
||||
impl<O: OperatorQuantization, C: Collection<O>> Quantization<O, C> {
|
||||
pub fn create(
|
||||
path: &Path,
|
||||
impl<O: OperatorQuantization> Quantizer<O> {
|
||||
pub fn train(
|
||||
options: IndexOptions,
|
||||
quantization_options: QuantizationOptions,
|
||||
collection: &Arc<C>,
|
||||
permutation: Vec<u32>, // permutation is the mapping from placements to original ids
|
||||
vectors: &impl Vectors<O>,
|
||||
) -> Self {
|
||||
use QuantizationOptions::*;
|
||||
match quantization_options {
|
||||
QuantizationOptions::Trivial(_) => Self::Trivial(TrivialQuantization::create(
|
||||
path,
|
||||
options,
|
||||
quantization_options,
|
||||
collection,
|
||||
permutation,
|
||||
)),
|
||||
QuantizationOptions::Scalar(_) => Self::Scalar(ScalarQuantization::create(
|
||||
path,
|
||||
options,
|
||||
quantization_options,
|
||||
collection,
|
||||
permutation,
|
||||
)),
|
||||
QuantizationOptions::Product(_) => Self::Product(ProductQuantization::create(
|
||||
path,
|
||||
options,
|
||||
quantization_options,
|
||||
collection,
|
||||
permutation,
|
||||
)),
|
||||
Trivial(_) => Self::Trivial,
|
||||
Scalar(_) => Self::Scalar(ScalarQuantizer::train(options, vectors)),
|
||||
Product(product) => Self::Product(ProductQuantizer::train(options, product, vectors)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn open(
|
||||
path: &Path,
|
||||
options: IndexOptions,
|
||||
quantization_options: QuantizationOptions,
|
||||
collection: &Arc<C>,
|
||||
) -> Self {
|
||||
match quantization_options {
|
||||
QuantizationOptions::Trivial(_) => Self::Trivial(TrivialQuantization::open(
|
||||
path,
|
||||
options,
|
||||
quantization_options,
|
||||
collection,
|
||||
)),
|
||||
QuantizationOptions::Scalar(_) => Self::Scalar(ScalarQuantization::open(
|
||||
path,
|
||||
options,
|
||||
quantization_options,
|
||||
collection,
|
||||
)),
|
||||
QuantizationOptions::Product(_) => Self::Product(ProductQuantization::open(
|
||||
path,
|
||||
options,
|
||||
quantization_options,
|
||||
collection,
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn distance(&self, lhs: Borrowed<'_, O>, rhs: u32) -> F32 {
|
||||
use Quantization::*;
|
||||
pub fn width(&self) -> usize {
|
||||
use Quantizer::*;
|
||||
match self {
|
||||
Trivial(x) => x.distance(lhs, rhs),
|
||||
Trivial => 0,
|
||||
Scalar(x) => x.width(),
|
||||
Product(x) => x.width(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode(&self, vector: &[Scalar<O>]) -> Vec<u8> {
|
||||
use Quantizer::*;
|
||||
match self {
|
||||
Trivial => Vec::new(),
|
||||
Scalar(x) => x.encode(vector),
|
||||
Product(x) => x.encode(vector),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn distance(&self, fallback: impl Fn() -> F32, lhs: Borrowed<'_, O>, rhs: &[u8]) -> F32 {
|
||||
use Quantizer::*;
|
||||
match self {
|
||||
Trivial => fallback(),
|
||||
Scalar(x) => x.distance(lhs, rhs),
|
||||
Product(x) => x.distance(lhs, rhs),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn distance2(&self, lhs: u32, rhs: u32) -> F32 {
|
||||
use Quantization::*;
|
||||
match self {
|
||||
Trivial(x) => x.distance2(lhs, rhs),
|
||||
Scalar(x) => x.distance2(lhs, rhs),
|
||||
Product(x) => x.distance2(lhs, rhs),
|
||||
}
|
||||
/// A trained quantizer together with the codes it produced for a set of vectors.
pub struct Quantization<O: OperatorQuantization> {
|
||||
// Trained quantizer; `create`/`open` keep it in the "train" entry of the directory.
train: Json<Quantizer<O>>,
|
||||
// Concatenated codes from the "codes" entry; `distance` reads vector `i` at
// bytes `i * width .. (i + 1) * width`, where `width` is `train.width()`.
codes: MmapArray<u8>,
|
||||
}
|
||||
|
||||
impl<O: OperatorQuantization> Quantization<O> {
|
||||
pub fn create(
|
||||
path: impl AsRef<Path>,
|
||||
options: IndexOptions,
|
||||
quantization_options: QuantizationOptions,
|
||||
vectors: &impl Vectors<O>,
|
||||
) -> Self {
|
||||
std::fs::create_dir(path.as_ref()).unwrap();
|
||||
let train = Quantizer::train(options, quantization_options, vectors);
|
||||
let train = Json::create(path.as_ref().join("train"), train);
|
||||
let codes = MmapArray::create(
|
||||
path.as_ref().join("codes"),
|
||||
(0..vectors.len()).flat_map(|i| train.encode(&vectors.vector(i).to_vec()).into_iter()),
|
||||
);
|
||||
sync_dir(path);
|
||||
Self { train, codes }
|
||||
}
|
||||
|
||||
pub fn open(path: impl AsRef<Path>) -> Self {
|
||||
let train = Json::open(path.as_ref().join("train"));
|
||||
let codes = MmapArray::open(path.as_ref().join("codes"));
|
||||
Self { train, codes }
|
||||
}
|
||||
|
||||
pub fn distance(&self, vectors: &impl Vectors<O>, lhs: Borrowed<'_, O>, rhs: u32) -> F32 {
|
||||
let width = self.train.width();
|
||||
let start = rhs as usize * width;
|
||||
let end = start + width;
|
||||
self.train.distance(
|
||||
|| O::distance(lhs, vectors.vector(rhs)),
|
||||
lhs,
|
||||
&self.codes[start..end],
|
||||
)
|
||||
}
|
||||
}
|
||||
|
@ -5,155 +5,132 @@ use base::index::*;
|
||||
use base::operator::*;
|
||||
use base::scalar::*;
|
||||
use base::search::*;
|
||||
use base::vector::*;
|
||||
use common::dir_ops::sync_dir;
|
||||
use common::mmap_array::MmapArray;
|
||||
use common::sample::sample_subvector;
|
||||
use common::sample::sample_subvector_transform;
|
||||
use common::vec2::Vec2;
|
||||
use elkan_k_means::ElkanKMeans;
|
||||
use num_traits::{Float, Zero};
|
||||
use rand::seq::index::sample;
|
||||
use rand::thread_rng;
|
||||
use std::marker::PhantomData;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use elkan_k_means::elkan_k_means;
|
||||
use num_traits::Float;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
|
||||
pub struct ProductQuantization<O: OperatorProductQuantization, C: Collection<O>> {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(bound = "")]
|
||||
pub struct ProductQuantizer<O: OperatorProductQuantization> {
|
||||
dims: u32,
|
||||
ratio: u32,
|
||||
centroids: Vec<Scalar<O>>,
|
||||
codes: MmapArray<u8>,
|
||||
_maker: PhantomData<fn(C) -> C>,
|
||||
centroids: Vec2<Scalar<O>>,
|
||||
}
|
||||
|
||||
unsafe impl<O: OperatorProductQuantization, C: Collection<O>> Send for ProductQuantization<O, C> {}
|
||||
unsafe impl<O: OperatorProductQuantization, C: Collection<O>> Sync for ProductQuantization<O, C> {}
|
||||
|
||||
impl<O: OperatorProductQuantization, C: Collection<O>> ProductQuantization<O, C> {
|
||||
fn codes(&self, i: u32) -> &[u8] {
|
||||
let width = self.dims.div_ceil(self.ratio);
|
||||
let s = i as usize * width as usize;
|
||||
let e = (i + 1) as usize * width as usize;
|
||||
&self.codes[s..e]
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: OperatorProductQuantization, C: Collection<O>> ProductQuantization<O, C> {
|
||||
pub fn create(
|
||||
path: &Path,
|
||||
impl<O: OperatorProductQuantization> ProductQuantizer<O> {
|
||||
pub fn train(
|
||||
options: IndexOptions,
|
||||
quantization_options: QuantizationOptions,
|
||||
collection: &Arc<C>,
|
||||
permutation: Vec<u32>, // permutation is the mapping from placements to original ids
|
||||
product_quantization_options: ProductQuantizationOptions,
|
||||
vectors: &impl Vectors<O>,
|
||||
) -> Self {
|
||||
std::fs::create_dir(path).unwrap();
|
||||
let QuantizationOptions::Product(quantization_options) = quantization_options else {
|
||||
unreachable!()
|
||||
};
|
||||
let dims = options.vector.dims;
|
||||
let ratio = quantization_options.ratio as u32;
|
||||
let n = collection.len();
|
||||
let m = std::cmp::min(n, quantization_options.sample);
|
||||
let samples = {
|
||||
let f = sample(&mut thread_rng(), n as usize, m as usize).into_vec();
|
||||
let mut samples = Vec2::<Scalar<O>>::new(dims, m as usize);
|
||||
for i in 0..m {
|
||||
samples[i as usize]
|
||||
.copy_from_slice(collection.vector(f[i as usize] as u32).to_vec().as_ref());
|
||||
}
|
||||
samples
|
||||
};
|
||||
let ratio = product_quantization_options.ratio as u32;
|
||||
let width = dims.div_ceil(ratio);
|
||||
let mut centroids = vec![Scalar::<O>::zero(); 256 * dims as usize];
|
||||
let mut centroids = Vec2::new(dims, 256);
|
||||
for i in 0..width {
|
||||
let subdims = std::cmp::min(ratio, dims - ratio * i);
|
||||
let mut subsamples = Vec2::<Scalar<O>>::new(subdims, m as usize);
|
||||
for j in 0..m {
|
||||
let src = &samples[j as usize][(i * ratio) as usize..][..subdims as usize];
|
||||
subsamples[j as usize].copy_from_slice(src);
|
||||
}
|
||||
let mut k_means = ElkanKMeans::<O::ProductQuantizationL2>::new(256, subsamples);
|
||||
for _ in 0..25 {
|
||||
if k_means.iterate() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let centroid = k_means.finish();
|
||||
let start = (i * ratio) as usize;
|
||||
let end = start + subdims as usize;
|
||||
let subsamples = sample_subvector(vectors, start, end);
|
||||
let centroid = elkan_k_means::<O::PQL2>(256, subsamples);
|
||||
for j in 0u8..=255 {
|
||||
centroids[j as usize * dims as usize..][(i * ratio) as usize..][..subdims as usize]
|
||||
centroids[j as usize][(i * ratio) as usize..][..subdims as usize]
|
||||
.copy_from_slice(¢roid[j as usize]);
|
||||
}
|
||||
}
|
||||
let codes_iter = (0..n).flat_map(|i| {
|
||||
let vector = collection.vector(permutation[i as usize]).to_vec();
|
||||
let width = dims.div_ceil(ratio);
|
||||
let mut result = Vec::with_capacity(width as usize);
|
||||
for i in 0..width {
|
||||
let subdims = std::cmp::min(ratio, dims - ratio * i);
|
||||
let mut minimal = F32::infinity();
|
||||
let mut target = 0u8;
|
||||
let left = &vector[(i * ratio) as usize..][..subdims as usize];
|
||||
for j in 0u8..=255 {
|
||||
let right = ¢roids[j as usize * dims as usize..][(i * ratio) as usize..]
|
||||
[..subdims as usize];
|
||||
let dis = O::product_quantization_l2_distance(left, right);
|
||||
if dis < minimal {
|
||||
minimal = dis;
|
||||
target = j;
|
||||
}
|
||||
}
|
||||
result.push(target);
|
||||
}
|
||||
result.into_iter()
|
||||
});
|
||||
sync_dir(path);
|
||||
std::fs::write(
|
||||
path.join("centroids"),
|
||||
serde_json::to_string(¢roids).unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let codes = MmapArray::create(&path.join("codes"), codes_iter);
|
||||
Self {
|
||||
dims,
|
||||
ratio,
|
||||
centroids,
|
||||
codes,
|
||||
_maker: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn open(
|
||||
path: &Path,
|
||||
pub fn train_transform(
|
||||
options: IndexOptions,
|
||||
quantization_options: QuantizationOptions,
|
||||
_: &Arc<C>,
|
||||
product_quantization_options: ProductQuantizationOptions,
|
||||
vectors: &impl Vectors<O>,
|
||||
transform_subvector: impl Fn(&mut [Scalar<O>], usize, usize) -> &[Scalar<O>],
|
||||
) -> Self {
|
||||
let QuantizationOptions::Product(quantization_options) = quantization_options else {
|
||||
unreachable!()
|
||||
};
|
||||
let centroids =
|
||||
serde_json::from_slice(&std::fs::read(path.join("centroids")).unwrap()).unwrap();
|
||||
let codes = MmapArray::open(&path.join("codes"));
|
||||
let dims = options.vector.dims;
|
||||
let ratio = product_quantization_options.ratio as u32;
|
||||
let width = dims.div_ceil(ratio);
|
||||
let mut centroids = Vec2::new(dims, 256);
|
||||
for i in 0..width {
|
||||
let subdims = std::cmp::min(ratio, dims - ratio * i);
|
||||
let start = (i * ratio) as usize;
|
||||
let end = start + subdims as usize;
|
||||
let subsamples = sample_subvector_transform(vectors, start, end, |v| {
|
||||
transform_subvector(v, start, end)
|
||||
});
|
||||
let centroid = elkan_k_means::<O::PQL2>(256, subsamples);
|
||||
for j in 0u8..=255 {
|
||||
centroids[j as usize][(i * ratio) as usize..][..subdims as usize]
|
||||
.copy_from_slice(¢roid[j as usize]);
|
||||
}
|
||||
}
|
||||
Self {
|
||||
dims: options.vector.dims,
|
||||
ratio: quantization_options.ratio as _,
|
||||
dims,
|
||||
ratio,
|
||||
centroids,
|
||||
codes,
|
||||
_maker: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn distance(&self, lhs: Borrowed<'_, O>, rhs: u32) -> F32 {
|
||||
#[inline(always)]
|
||||
pub fn width(&self) -> usize {
|
||||
self.dims as usize
|
||||
}
|
||||
|
||||
/// Returns the `ratio` field widened to `usize`; `encode` assigns one code byte to every
/// `ratio` consecutive dimensions.
#[inline(always)]
|
||||
pub fn ratio(&self) -> usize {
|
||||
self.ratio as usize
|
||||
}
|
||||
|
||||
/// Borrows the centroid table; `encode` indexes it by code value (0..=255) and then by
/// dimension offset.
#[inline(always)]
|
||||
pub fn centroids(&self) -> &Vec2<Scalar<O>> {
|
||||
&self.centroids
|
||||
}
|
||||
|
||||
pub fn encode(&self, vector: &[Scalar<O>]) -> Vec<u8> {
|
||||
let dims = self.dims;
|
||||
let ratio = self.ratio;
|
||||
let width = dims.div_ceil(ratio);
|
||||
let mut result = Vec::with_capacity(width as usize);
|
||||
for i in 0..width {
|
||||
let subdims = std::cmp::min(ratio, dims - ratio * i);
|
||||
let mut minimal = F32::infinity();
|
||||
let mut target = 0u8;
|
||||
let left = &vector[(i * ratio) as usize..][..subdims as usize];
|
||||
for j in 0u8..=255 {
|
||||
let right = &self.centroids[j as usize][(i * ratio) as usize..][..subdims as usize];
|
||||
let dis = O::dense_l2_distance(left, right);
|
||||
if dis < minimal {
|
||||
minimal = dis;
|
||||
target = j;
|
||||
}
|
||||
}
|
||||
result.push(target);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn distance(&self, lhs: Borrowed<'_, O>, rhs: &[u8]) -> F32 {
|
||||
let dims = self.dims;
|
||||
let ratio = self.ratio;
|
||||
let rhs = self.codes(rhs);
|
||||
O::product_quantization_distance(dims, ratio, &self.centroids, lhs, rhs)
|
||||
}
|
||||
|
||||
pub fn distance2(&self, lhs: u32, rhs: u32) -> F32 {
|
||||
pub fn distance_with_delta(
|
||||
&self,
|
||||
lhs: Borrowed<'_, O>,
|
||||
rhs: &[u8],
|
||||
delta: &[Scalar<O>],
|
||||
) -> F32 {
|
||||
let dims = self.dims;
|
||||
let ratio = self.ratio;
|
||||
let lhs = self.codes(lhs);
|
||||
let rhs = self.codes(rhs);
|
||||
O::product_quantization_distance2(dims, ratio, &self.centroids, lhs, rhs)
|
||||
O::product_quantization_distance_with_delta(dims, ratio, &self.centroids, lhs, rhs, delta)
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -4,100 +4,52 @@ use self::operator::OperatorScalarQuantization;
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
use base::scalar::*;
|
||||
use base::search::Collection;
|
||||
use base::search::Vectors;
|
||||
use base::vector::*;
|
||||
use common::dir_ops::sync_dir;
|
||||
use common::mmap_array::MmapArray;
|
||||
use num_traits::Float;
|
||||
use std::marker::PhantomData;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
|
||||
pub struct ScalarQuantization<O: OperatorScalarQuantization, C: Collection<O>> {
|
||||
dims: u16,
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(bound = "")]
|
||||
pub struct ScalarQuantizer<O: OperatorScalarQuantization> {
|
||||
dims: u32,
|
||||
max: Vec<Scalar<O>>,
|
||||
min: Vec<Scalar<O>>,
|
||||
codes: MmapArray<u8>,
|
||||
_maker: PhantomData<fn(C) -> C>,
|
||||
}
|
||||
|
||||
unsafe impl<O: OperatorScalarQuantization, C: Collection<O>> Send for ScalarQuantization<O, C> {}
|
||||
unsafe impl<O: OperatorScalarQuantization, C: Collection<O>> Sync for ScalarQuantization<O, C> {}
|
||||
|
||||
impl<O: OperatorScalarQuantization, C: Collection<O>> ScalarQuantization<O, C> {
|
||||
fn codes(&self, i: u32) -> &[u8] {
|
||||
let s = i as usize * self.dims as usize;
|
||||
let e = (i + 1) as usize * self.dims as usize;
|
||||
&self.codes[s..e]
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: OperatorScalarQuantization, C: Collection<O>> ScalarQuantization<O, C> {
|
||||
pub fn create(
|
||||
path: &Path,
|
||||
options: IndexOptions,
|
||||
_: QuantizationOptions,
|
||||
collection: &Arc<C>,
|
||||
permutation: Vec<u32>, // permutation is the mapping from placements to original ids
|
||||
) -> Self {
|
||||
std::fs::create_dir(path).unwrap();
|
||||
let dims: u16 = options.vector.dims.try_into().unwrap();
|
||||
impl<O: OperatorScalarQuantization> ScalarQuantizer<O> {
|
||||
pub fn train(options: IndexOptions, vectors: &impl Vectors<O>) -> Self {
|
||||
let dims = options.vector.dims;
|
||||
let mut max = vec![Scalar::<O>::neg_infinity(); dims as usize];
|
||||
let mut min = vec![Scalar::<O>::infinity(); dims as usize];
|
||||
let n = collection.len();
|
||||
let n = vectors.len();
|
||||
for i in 0..n {
|
||||
let vector = collection.vector(permutation[i as usize]).to_vec();
|
||||
let vector = vectors.vector(i).to_vec();
|
||||
for j in 0..dims as usize {
|
||||
max[j] = std::cmp::max(max[j], vector[j]);
|
||||
min[j] = std::cmp::min(min[j], vector[j]);
|
||||
}
|
||||
}
|
||||
std::fs::write(path.join("max"), serde_json::to_string(&max).unwrap()).unwrap();
|
||||
std::fs::write(path.join("min"), serde_json::to_string(&min).unwrap()).unwrap();
|
||||
let codes_iter = (0..n).flat_map(|i| {
|
||||
let vector = collection.vector(permutation[i as usize]).to_vec();
|
||||
let mut result = vec![0u8; dims as usize];
|
||||
for i in 0..dims as usize {
|
||||
let w = (((vector[i] - min[i]) / (max[i] - min[i])).to_f32() * 256.0) as u32;
|
||||
result[i] = w.clamp(0, 255) as u8;
|
||||
}
|
||||
result.into_iter()
|
||||
});
|
||||
let codes = MmapArray::create(&path.join("codes"), codes_iter);
|
||||
sync_dir(path);
|
||||
Self {
|
||||
dims,
|
||||
max,
|
||||
min,
|
||||
codes,
|
||||
_maker: PhantomData,
|
||||
}
|
||||
Self { dims, max, min }
|
||||
}
|
||||
|
||||
pub fn open(path: &Path, options: IndexOptions, _: QuantizationOptions, _: &Arc<C>) -> Self {
|
||||
let dims: u16 = options.vector.dims.try_into().unwrap();
|
||||
let max = serde_json::from_slice(&std::fs::read("max").unwrap()).unwrap();
|
||||
let min = serde_json::from_slice(&std::fs::read("min").unwrap()).unwrap();
|
||||
let codes = MmapArray::open(&path.join("codes"));
|
||||
Self {
|
||||
dims,
|
||||
max,
|
||||
min,
|
||||
codes,
|
||||
_maker: PhantomData,
|
||||
}
|
||||
/// Returns `dims` as `usize`; `encode` in this type writes one code byte per dimension.
pub fn width(&self) -> usize {
|
||||
self.dims as usize
|
||||
}
|
||||
|
||||
pub fn distance(&self, lhs: Borrowed<'_, O>, rhs: u32) -> F32 {
|
||||
pub fn encode(&self, vector: &[Scalar<O>]) -> Vec<u8> {
|
||||
let dims = self.dims;
|
||||
let rhs = self.codes(rhs);
|
||||
O::scalar_quantization_distance(dims, &self.max, &self.min, lhs, rhs)
|
||||
let mut result = vec![0u8; dims as usize];
|
||||
for i in 0..dims as usize {
|
||||
let w =
|
||||
(((vector[i] - self.min[i]) / (self.max[i] - self.min[i])).to_f32() * 256.0) as u32;
|
||||
result[i] = w.clamp(0, 255) as u8;
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn distance2(&self, lhs: u32, rhs: u32) -> F32 {
|
||||
let dims = self.dims;
|
||||
let lhs = self.codes(lhs);
|
||||
let rhs = self.codes(rhs);
|
||||
O::scalar_quantization_distance2(dims, &self.max, &self.min, lhs, rhs)
|
||||
/// Distance between the raw vector `lhs` and the quantized code `rhs`, delegated to
/// `O::scalar_quantization_distance` together with the stored `max`/`min` bounds.
pub fn distance(&self, lhs: Borrowed<'_, O>, rhs: &[u8]) -> F32 {
|
||||
// NOTE(review): `as _` casts `dims` (u32) to whatever integer type the operator method
// declares; if that type is narrower than u32 the value is truncated silently - confirm.
O::scalar_quantization_distance(self.dims as _, &self.max, &self.min, lhs, rhs)
|
||||
}
|
||||
}
|
||||
|
@ -11,13 +11,6 @@ pub trait OperatorScalarQuantization: Operator {
|
||||
lhs: Borrowed<'_, Self>,
|
||||
rhs: &[u8],
|
||||
) -> F32;
|
||||
fn scalar_quantization_distance2(
|
||||
dims: u16,
|
||||
max: &[Scalar<Self>],
|
||||
min: &[Scalar<Self>],
|
||||
lhs: &[u8],
|
||||
rhs: &[u8],
|
||||
) -> F32;
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for BVecf32Cos {
|
||||
@ -30,16 +23,6 @@ impl OperatorScalarQuantization for BVecf32Cos {
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn scalar_quantization_distance2(
|
||||
_dims: u16,
|
||||
_max: &[Scalar<Self>],
|
||||
_min: &[Scalar<Self>],
|
||||
_lhs: &[u8],
|
||||
_rhs: &[u8],
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for BVecf32Dot {
|
||||
@ -52,16 +35,6 @@ impl OperatorScalarQuantization for BVecf32Dot {
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn scalar_quantization_distance2(
|
||||
_dims: u16,
|
||||
_max: &[Scalar<Self>],
|
||||
_min: &[Scalar<Self>],
|
||||
_lhs: &[u8],
|
||||
_rhs: &[u8],
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for BVecf32Jaccard {
|
||||
@ -74,16 +47,6 @@ impl OperatorScalarQuantization for BVecf32Jaccard {
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn scalar_quantization_distance2(
|
||||
_dims: u16,
|
||||
_max: &[Scalar<Self>],
|
||||
_min: &[Scalar<Self>],
|
||||
_lhs: &[u8],
|
||||
_rhs: &[u8],
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for BVecf32L2 {
|
||||
@ -96,16 +59,6 @@ impl OperatorScalarQuantization for BVecf32L2 {
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn scalar_quantization_distance2(
|
||||
_dims: u16,
|
||||
_max: &[Scalar<Self>],
|
||||
_min: &[Scalar<Self>],
|
||||
_lhs: &[u8],
|
||||
_rhs: &[u8],
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for SVecf32Cos {
|
||||
@ -118,16 +71,6 @@ impl OperatorScalarQuantization for SVecf32Cos {
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn scalar_quantization_distance2(
|
||||
_dims: u16,
|
||||
_max: &[Scalar<Self>],
|
||||
_min: &[Scalar<Self>],
|
||||
_lhs: &[u8],
|
||||
_rhs: &[u8],
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for SVecf32Dot {
|
||||
@ -140,16 +83,6 @@ impl OperatorScalarQuantization for SVecf32Dot {
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn scalar_quantization_distance2(
|
||||
_dims: u16,
|
||||
_max: &[Scalar<Self>],
|
||||
_min: &[Scalar<Self>],
|
||||
_lhs: &[u8],
|
||||
_rhs: &[u8],
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for SVecf32L2 {
|
||||
@ -162,16 +95,6 @@ impl OperatorScalarQuantization for SVecf32L2 {
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn scalar_quantization_distance2(
|
||||
_dims: u16,
|
||||
_max: &[Scalar<Self>],
|
||||
_min: &[Scalar<Self>],
|
||||
_lhs: &[u8],
|
||||
_rhs: &[u8],
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for Vecf16Cos {
|
||||
@ -196,27 +119,6 @@ impl OperatorScalarQuantization for Vecf16Cos {
|
||||
}
|
||||
F32(1.0) - xy / (x2 * y2).sqrt()
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
fn scalar_quantization_distance2(
|
||||
dims: u16,
|
||||
max: &[F16],
|
||||
min: &[F16],
|
||||
lhs: &[u8],
|
||||
rhs: &[u8],
|
||||
) -> F32 {
|
||||
let mut xy = F32::zero();
|
||||
let mut x2 = F32::zero();
|
||||
let mut y2 = F32::zero();
|
||||
for i in 0..dims as usize {
|
||||
let _x = F32(lhs[i] as f32 / 256.0) * (max[i].to_f() - min[i].to_f()) + min[i].to_f();
|
||||
let _y = F32(rhs[i] as f32 / 256.0) * (max[i].to_f() - min[i].to_f()) + min[i].to_f();
|
||||
xy += _x * _y;
|
||||
x2 += _x * _x;
|
||||
y2 += _y * _y;
|
||||
}
|
||||
F32(1.0) - xy / (x2 * y2).sqrt()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for Vecf16Dot {
|
||||
@ -237,23 +139,6 @@ impl OperatorScalarQuantization for Vecf16Dot {
|
||||
}
|
||||
xy * (-1.0)
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
fn scalar_quantization_distance2(
|
||||
dims: u16,
|
||||
max: &[F16],
|
||||
min: &[F16],
|
||||
lhs: &[u8],
|
||||
rhs: &[u8],
|
||||
) -> F32 {
|
||||
let mut xy = F32::zero();
|
||||
for i in 0..dims as usize {
|
||||
let _x = F32(lhs[i] as f32 / 256.0) * (max[i].to_f() - min[i].to_f()) + min[i].to_f();
|
||||
let _y = F32(rhs[i] as f32 / 256.0) * (max[i].to_f() - min[i].to_f()) + min[i].to_f();
|
||||
xy += _x * _y;
|
||||
}
|
||||
xy * (-1.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for Vecf16L2 {
|
||||
@ -274,23 +159,6 @@ impl OperatorScalarQuantization for Vecf16L2 {
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
fn scalar_quantization_distance2(
|
||||
dims: u16,
|
||||
max: &[F16],
|
||||
min: &[F16],
|
||||
lhs: &[u8],
|
||||
rhs: &[u8],
|
||||
) -> F32 {
|
||||
let mut result = F32::zero();
|
||||
for i in 0..dims as usize {
|
||||
let _x = F32(lhs[i] as f32 / 256.0) * (max[i].to_f() - min[i].to_f()) + min[i].to_f();
|
||||
let _y = F32(rhs[i] as f32 / 256.0) * (max[i].to_f() - min[i].to_f()) + min[i].to_f();
|
||||
result += (_x - _y) * (_x - _y);
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for Vecf32Cos {
|
||||
@ -315,27 +183,6 @@ impl OperatorScalarQuantization for Vecf32Cos {
|
||||
}
|
||||
F32(1.0) - xy / (x2 * y2).sqrt()
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
fn scalar_quantization_distance2(
|
||||
dims: u16,
|
||||
max: &[F32],
|
||||
min: &[F32],
|
||||
lhs: &[u8],
|
||||
rhs: &[u8],
|
||||
) -> F32 {
|
||||
let mut xy = F32::zero();
|
||||
let mut x2 = F32::zero();
|
||||
let mut y2 = F32::zero();
|
||||
for i in 0..dims as usize {
|
||||
let _x = F32(lhs[i] as f32 / 256.0) * (max[i] - min[i]) + min[i];
|
||||
let _y = F32(rhs[i] as f32 / 256.0) * (max[i] - min[i]) + min[i];
|
||||
xy += _x * _y;
|
||||
x2 += _x * _x;
|
||||
y2 += _y * _y;
|
||||
}
|
||||
F32(1.0) - xy / (x2 * y2).sqrt()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for Vecf32Dot {
|
||||
@ -356,23 +203,6 @@ impl OperatorScalarQuantization for Vecf32Dot {
|
||||
}
|
||||
xy * (-1.0)
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
fn scalar_quantization_distance2(
|
||||
dims: u16,
|
||||
max: &[F32],
|
||||
min: &[F32],
|
||||
lhs: &[u8],
|
||||
rhs: &[u8],
|
||||
) -> F32 {
|
||||
let mut xy = F32::zero();
|
||||
for i in 0..dims as usize {
|
||||
let _x = F32(lhs[i] as f32 / 256.0) * (max[i] - min[i]) + min[i];
|
||||
let _y = F32(rhs[i] as f32 / 256.0) * (max[i] - min[i]) + min[i];
|
||||
xy += _x * _y;
|
||||
}
|
||||
xy * (-1.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for Vecf32L2 {
|
||||
@ -393,23 +223,6 @@ impl OperatorScalarQuantization for Vecf32L2 {
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
#[detect::multiversion(v4, v3, v2, neon, fallback)]
|
||||
fn scalar_quantization_distance2(
|
||||
dims: u16,
|
||||
max: &[F32],
|
||||
min: &[F32],
|
||||
lhs: &[u8],
|
||||
rhs: &[u8],
|
||||
) -> F32 {
|
||||
let mut result = F32::zero();
|
||||
for i in 0..dims as usize {
|
||||
let _x = F32(lhs[i] as f32 / 256.0) * (max[i] - min[i]) + min[i];
|
||||
let _y = F32(rhs[i] as f32 / 256.0) * (max[i] - min[i]) + min[i];
|
||||
result += (_x - _y) * (_x - _y);
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for Veci8Cos {
|
||||
@ -422,15 +235,6 @@ impl OperatorScalarQuantization for Veci8Cos {
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
fn scalar_quantization_distance2(
|
||||
_dims: u16,
|
||||
_max: &[Scalar<Self>],
|
||||
_min: &[Scalar<Self>],
|
||||
_lhs: &[u8],
|
||||
_rhs: &[u8],
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for Veci8Dot {
|
||||
@ -443,15 +247,6 @@ impl OperatorScalarQuantization for Veci8Dot {
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
fn scalar_quantization_distance2(
|
||||
_dims: u16,
|
||||
_max: &[Scalar<Self>],
|
||||
_min: &[Scalar<Self>],
|
||||
_lhs: &[u8],
|
||||
_rhs: &[u8],
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl OperatorScalarQuantization for Veci8L2 {
|
||||
@ -464,13 +259,4 @@ impl OperatorScalarQuantization for Veci8L2 {
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
fn scalar_quantization_distance2(
|
||||
_dims: u16,
|
||||
_max: &[Scalar<Self>],
|
||||
_min: &[Scalar<Self>],
|
||||
_lhs: &[u8],
|
||||
_rhs: &[u8],
|
||||
) -> F32 {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
@ -1,63 +0,0 @@
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
use base::scalar::*;
|
||||
use base::search::Collection;
|
||||
use common::dir_ops::sync_dir;
|
||||
use std::marker::PhantomData;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct TrivialQuantization<O: Operator, C: Collection<O>> {
|
||||
collection: Arc<C>,
|
||||
permutation: Vec<u32>,
|
||||
_maker: PhantomData<fn(O) -> O>,
|
||||
}
|
||||
|
||||
impl<O: Operator, C: Collection<O>> TrivialQuantization<O, C> {
|
||||
fn codes(&self, i: u32) -> Borrowed<'_, O> {
|
||||
self.collection.vector(self.permutation[i as usize])
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Operator, C: Collection<O>> TrivialQuantization<O, C> {
|
||||
// permutation is the mapping from placements to original ids
|
||||
pub fn create(
|
||||
path: &Path,
|
||||
_: IndexOptions,
|
||||
_: QuantizationOptions,
|
||||
collection: &Arc<C>,
|
||||
permutation: Vec<u32>,
|
||||
) -> Self {
|
||||
// here we cannot modify origin, so we record permutation for translation
|
||||
std::fs::create_dir(path).unwrap();
|
||||
sync_dir(path);
|
||||
std::fs::write(
|
||||
path.join("permutation"),
|
||||
serde_json::to_string(&permutation).unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
Self {
|
||||
collection: collection.clone(),
|
||||
permutation,
|
||||
_maker: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn open(path: &Path, _: IndexOptions, _: QuantizationOptions, collection: &Arc<C>) -> Self {
|
||||
let permutation =
|
||||
serde_json::from_slice(&std::fs::read(path.join("permutation")).unwrap()).unwrap();
|
||||
Self {
|
||||
collection: collection.clone(),
|
||||
permutation,
|
||||
_maker: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn distance(&self, lhs: Borrowed<'_, O>, rhs: u32) -> F32 {
|
||||
O::distance(lhs, self.codes(rhs))
|
||||
}
|
||||
|
||||
pub fn distance2(&self, lhs: u32, rhs: u32) -> F32 {
|
||||
O::distance(self.codes(lhs), self.codes(rhs))
|
||||
}
|
||||
}
|
@ -211,6 +211,26 @@ impl Instance {
|
||||
Instance::Veci8Dot(x) => x.alter(key, value),
|
||||
}
|
||||
}
|
||||
pub fn delete(&self, pointer: Pointer) -> Result<(), DeleteError> {
|
||||
match self {
|
||||
Instance::Vecf32Cos(x) => x.delete(pointer),
|
||||
Instance::Vecf32Dot(x) => x.delete(pointer),
|
||||
Instance::Vecf32L2(x) => x.delete(pointer),
|
||||
Instance::Vecf16Cos(x) => x.delete(pointer),
|
||||
Instance::Vecf16Dot(x) => x.delete(pointer),
|
||||
Instance::Vecf16L2(x) => x.delete(pointer),
|
||||
Instance::SVecf32Cos(x) => x.delete(pointer),
|
||||
Instance::SVecf32Dot(x) => x.delete(pointer),
|
||||
Instance::SVecf32L2(x) => x.delete(pointer),
|
||||
Instance::BVecf32Cos(x) => x.delete(pointer),
|
||||
Instance::BVecf32Dot(x) => x.delete(pointer),
|
||||
Instance::BVecf32L2(x) => x.delete(pointer),
|
||||
Instance::BVecf32Jaccard(x) => x.delete(pointer),
|
||||
Instance::Veci8Cos(x) => x.delete(pointer),
|
||||
Instance::Veci8Dot(x) => x.delete(pointer),
|
||||
Instance::Veci8L2(x) => x.delete(pointer),
|
||||
}
|
||||
}
|
||||
pub fn start(&self) {
|
||||
match self {
|
||||
Instance::Vecf32Cos(x) => x.start(),
|
||||
@ -475,26 +495,6 @@ impl InstanceView {
|
||||
_ => Err(InsertError::InvalidVector),
|
||||
}
|
||||
}
|
||||
pub fn delete(&self, pointer: Pointer) -> Result<(), DeleteError> {
|
||||
match self {
|
||||
InstanceView::Vecf32Cos(x) => x.delete(pointer),
|
||||
InstanceView::Vecf32Dot(x) => x.delete(pointer),
|
||||
InstanceView::Vecf32L2(x) => x.delete(pointer),
|
||||
InstanceView::Vecf16Cos(x) => x.delete(pointer),
|
||||
InstanceView::Vecf16Dot(x) => x.delete(pointer),
|
||||
InstanceView::Vecf16L2(x) => x.delete(pointer),
|
||||
InstanceView::SVecf32Cos(x) => x.delete(pointer),
|
||||
InstanceView::SVecf32Dot(x) => x.delete(pointer),
|
||||
InstanceView::SVecf32L2(x) => x.delete(pointer),
|
||||
InstanceView::BVecf32Cos(x) => x.delete(pointer),
|
||||
InstanceView::BVecf32Dot(x) => x.delete(pointer),
|
||||
InstanceView::BVecf32L2(x) => x.delete(pointer),
|
||||
InstanceView::BVecf32Jaccard(x) => x.delete(pointer),
|
||||
InstanceView::Veci8Cos(x) => x.delete(pointer),
|
||||
InstanceView::Veci8Dot(x) => x.delete(pointer),
|
||||
InstanceView::Veci8L2(x) => x.delete(pointer),
|
||||
}
|
||||
}
|
||||
pub fn flush(&self) -> Result<(), FlushError> {
|
||||
match self {
|
||||
InstanceView::Vecf32Cos(x) => x.flush(),
|
||||
|
@ -1,6 +1,3 @@
|
||||
#![feature(trait_alias)]
|
||||
#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512))]
|
||||
|
||||
mod instance;
|
||||
mod version;
|
||||
mod worker;
|
||||
|
@ -18,7 +18,7 @@ pub struct Version {
|
||||
}
|
||||
|
||||
impl Version {
|
||||
const VERSION: u64 = 4;
|
||||
const VERSION: u64 = 6;
|
||||
const SOFT_VERSION: u64 = 0;
|
||||
}
|
||||
|
||||
|
@ -164,8 +164,7 @@ impl WorkerOperations for Worker {
|
||||
fn delete(&self, handle: Handle, pointer: Pointer) -> Result<(), DeleteError> {
|
||||
let view = self.view();
|
||||
let instance = view.get(handle).ok_or(DeleteError::NotExist)?;
|
||||
let view = instance.view();
|
||||
view.delete(pointer)?;
|
||||
instance.delete(pointer)?;
|
||||
Ok(())
|
||||
}
|
||||
fn view_basic(&self, handle: Handle) -> Result<impl ViewBasicOperations, BasicError> {
|
||||
|
@ -1,11 +1,11 @@
|
||||
[package]
|
||||
name = "rayon"
|
||||
name = "stoppable_rayon"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[dependencies]
|
||||
log.workspace = true
|
||||
rayoff = { package = "rayon", version = "1.8.1" }
|
||||
rayon = { package = "rayon", version = "1.8.1" }
|
||||
|
||||
[lints]
|
||||
workspace = true
|
@ -1,6 +1,3 @@
|
||||
#![feature(thread_local)]
|
||||
|
||||
use rayoff as rayon;
|
||||
use std::cell::RefCell;
|
||||
use std::panic::AssertUnwindSafe;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
@ -110,17 +107,20 @@ impl<'a> ThreadPool<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
#[thread_local]
|
||||
static STOP: RefCell<Option<Arc<AtomicBool>>> = RefCell::new(None);
|
||||
std::thread_local! {
|
||||
static STOP: RefCell<Option<Arc<AtomicBool>>> = const { RefCell::new(None) };
|
||||
}
|
||||
|
||||
struct CheckPanic;
|
||||
|
||||
pub fn check() {
|
||||
if let Some(stop) = STOP.borrow().as_ref() {
|
||||
if stop.load(Ordering::Relaxed) {
|
||||
std::panic::panic_any(CheckPanic);
|
||||
STOP.with(|stop| {
|
||||
if let Some(stop) = stop.borrow().as_ref() {
|
||||
if stop.load(Ordering::Relaxed) {
|
||||
std::panic::panic_any(CheckPanic);
|
||||
}
|
||||
} else {
|
||||
panic!("`check` is called outside rayon")
|
||||
}
|
||||
} else {
|
||||
panic!("`check` is called outside rayon")
|
||||
}
|
||||
});
|
||||
}
|
@ -6,6 +6,8 @@ edition.workspace = true
|
||||
[dependencies]
|
||||
base = { path = "../base" }
|
||||
common = { path = "../common" }
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
@ -1,62 +1,51 @@
|
||||
use crate::Storage;
|
||||
pub use base::index::*;
|
||||
use base::operator::Operator;
|
||||
pub use base::search::*;
|
||||
pub use base::vector::*;
|
||||
use base::search::*;
|
||||
use base::vector::*;
|
||||
use common::json::Json;
|
||||
use common::mmap_array::MmapArray;
|
||||
use std::path::Path;
|
||||
|
||||
pub struct BVectorStorage {
|
||||
vectors: MmapArray<usize>,
|
||||
payload: MmapArray<Payload>,
|
||||
dims: u16,
|
||||
dims: Json<u32>,
|
||||
len: Json<u32>,
|
||||
slice: MmapArray<usize>,
|
||||
}
|
||||
|
||||
impl Storage for BVectorStorage {
|
||||
type VectorOwned = BVecf32Owned;
|
||||
|
||||
impl<O: Operator<VectorOwned = BVecf32Owned>> Vectors<O> for BVectorStorage {
|
||||
fn dims(&self) -> u32 {
|
||||
self.dims as u32
|
||||
*self.dims
|
||||
}
|
||||
|
||||
fn len(&self) -> u32 {
|
||||
self.payload.len() as u32
|
||||
*self.len
|
||||
}
|
||||
|
||||
fn vector(&self, i: u32) -> BVecf32Borrowed<'_> {
|
||||
let size = (self.dims as usize).div_ceil(BVEC_WIDTH);
|
||||
let size = (*self.dims as usize).div_ceil(BVEC_WIDTH);
|
||||
let s = i as usize * size;
|
||||
let e = (i + 1) as usize * size;
|
||||
BVecf32Borrowed::new(self.dims, &self.vectors[s..e])
|
||||
}
|
||||
|
||||
fn payload(&self, i: u32) -> Payload {
|
||||
self.payload[i as usize]
|
||||
}
|
||||
|
||||
fn open(path: &Path, options: IndexOptions) -> Self {
|
||||
let vectors = MmapArray::open(&path.join("vectors"));
|
||||
let payload = MmapArray::open(&path.join("payload"));
|
||||
Self {
|
||||
vectors,
|
||||
payload,
|
||||
dims: options.vector.dims.try_into().unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
fn save<O: Operator<VectorOwned = Self::VectorOwned>, C: Collection<O>>(
|
||||
path: &Path,
|
||||
collection: &C,
|
||||
) -> Self {
|
||||
let n = collection.len();
|
||||
let vectors_iter = (0..n).flat_map(|i| collection.vector(i).data().iter().copied());
|
||||
let payload_iter = (0..n).map(|i| collection.payload(i));
|
||||
let vectors = MmapArray::create(&path.join("vectors"), vectors_iter);
|
||||
let payload = MmapArray::create(&path.join("payload"), payload_iter);
|
||||
Self {
|
||||
vectors,
|
||||
payload,
|
||||
dims: collection.dims().try_into().unwrap(),
|
||||
}
|
||||
BVecf32Borrowed::new(*self.dims as _, &self.slice[s..e])
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Operator<VectorOwned = BVecf32Owned>> Storage<O> for BVectorStorage {
|
||||
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<O>) -> Self {
|
||||
std::fs::create_dir(path.as_ref()).unwrap();
|
||||
let dims = Json::create(path.as_ref().join("dims"), vectors.dims());
|
||||
let len = Json::create(path.as_ref().join("len"), vectors.len());
|
||||
let slice = MmapArray::create(
|
||||
path.as_ref().join("slice"),
|
||||
(0..*len).flat_map(|i| vectors.vector(i).data().iter().copied()),
|
||||
);
|
||||
common::dir_ops::sync_dir(path);
|
||||
Self { dims, len, slice }
|
||||
}
|
||||
|
||||
fn open(path: impl AsRef<Path>) -> Self {
|
||||
let dims = Json::open(path.as_ref().join("dims"));
|
||||
let len = Json::open(path.as_ref().join("len"));
|
||||
let slice = MmapArray::open(path.as_ref().join("slice"));
|
||||
Self { dims, len, slice }
|
||||
}
|
||||
}
|
||||
|
@ -1,68 +1,82 @@
|
||||
#![allow(clippy::len_without_is_empty)]
|
||||
|
||||
mod bvector;
|
||||
pub mod operator;
|
||||
mod svec;
|
||||
pub mod vec;
|
||||
mod vec;
|
||||
mod veci8;
|
||||
|
||||
use self::operator::OperatorStorage;
|
||||
use base::index::*;
|
||||
use base::operator::*;
|
||||
use base::scalar::*;
|
||||
use base::search::*;
|
||||
use base::vector::*;
|
||||
use std::path::Path;
|
||||
|
||||
pub trait Storage {
|
||||
type VectorOwned: VectorOwned;
|
||||
|
||||
fn dims(&self) -> u32;
|
||||
fn len(&self) -> u32;
|
||||
fn vector(&self, i: u32) -> <Self::VectorOwned as VectorOwned>::Borrowed<'_>;
|
||||
fn payload(&self, i: u32) -> Payload;
|
||||
fn open(path: &Path, options: IndexOptions) -> Self;
|
||||
fn save<O: Operator<VectorOwned = Self::VectorOwned>, C: Collection<O>>(
|
||||
path: &Path,
|
||||
collection: &C,
|
||||
) -> Self;
|
||||
pub trait Storage<O: Operator>: Vectors<O> {
|
||||
fn open(path: impl AsRef<Path>) -> Self;
|
||||
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<O>) -> Self;
|
||||
}
|
||||
|
||||
pub struct StorageCollection<O: OperatorStorage> {
|
||||
storage: O::Storage,
|
||||
pub trait OperatorStorage: Operator {
|
||||
type Storage: Storage<Self> + Send + Sync;
|
||||
}
|
||||
|
||||
impl<O: OperatorStorage> StorageCollection<O> {
|
||||
pub fn create<C: Collection<O>>(path: &Path, source: &C) -> Self {
|
||||
std::fs::create_dir(path).unwrap();
|
||||
let storage = O::Storage::save(path, source);
|
||||
common::dir_ops::sync_dir(path);
|
||||
Self { storage }
|
||||
}
|
||||
|
||||
pub fn open(path: &Path, options: IndexOptions) -> Self {
|
||||
Self {
|
||||
storage: O::Storage::open(path, options),
|
||||
}
|
||||
}
|
||||
impl OperatorStorage for SVecf32Cos {
|
||||
type Storage = svec::SVecStorage;
|
||||
}
|
||||
|
||||
impl<O: OperatorStorage> Collection<O> for StorageCollection<O> {
|
||||
fn dims(&self) -> u32 {
|
||||
self.storage.dims()
|
||||
}
|
||||
|
||||
fn len(&self) -> u32 {
|
||||
self.storage.len()
|
||||
}
|
||||
|
||||
fn vector(&self, i: u32) -> Borrowed<'_, O> {
|
||||
self.storage.vector(i)
|
||||
}
|
||||
|
||||
fn payload(&self, i: u32) -> Payload {
|
||||
self.storage.payload(i)
|
||||
}
|
||||
impl OperatorStorage for SVecf32Dot {
|
||||
type Storage = svec::SVecStorage;
|
||||
}
|
||||
|
||||
unsafe impl<O: OperatorStorage> Send for StorageCollection<O> {}
|
||||
unsafe impl<O: OperatorStorage> Sync for StorageCollection<O> {}
|
||||
impl OperatorStorage for SVecf32L2 {
|
||||
type Storage = svec::SVecStorage;
|
||||
}
|
||||
|
||||
impl OperatorStorage for Vecf16Cos {
|
||||
type Storage = vec::VecStorage<F16>;
|
||||
}
|
||||
|
||||
impl OperatorStorage for Vecf16Dot {
|
||||
type Storage = vec::VecStorage<F16>;
|
||||
}
|
||||
|
||||
impl OperatorStorage for Vecf16L2 {
|
||||
type Storage = vec::VecStorage<F16>;
|
||||
}
|
||||
|
||||
impl OperatorStorage for Vecf32Cos {
|
||||
type Storage = vec::VecStorage<F32>;
|
||||
}
|
||||
|
||||
impl OperatorStorage for Vecf32Dot {
|
||||
type Storage = vec::VecStorage<F32>;
|
||||
}
|
||||
|
||||
impl OperatorStorage for Vecf32L2 {
|
||||
type Storage = vec::VecStorage<F32>;
|
||||
}
|
||||
|
||||
impl OperatorStorage for BVecf32Cos {
|
||||
type Storage = bvector::BVectorStorage;
|
||||
}
|
||||
|
||||
impl OperatorStorage for BVecf32Dot {
|
||||
type Storage = bvector::BVectorStorage;
|
||||
}
|
||||
|
||||
impl OperatorStorage for BVecf32L2 {
|
||||
type Storage = bvector::BVectorStorage;
|
||||
}
|
||||
|
||||
impl OperatorStorage for BVecf32Jaccard {
|
||||
type Storage = bvector::BVectorStorage;
|
||||
}
|
||||
|
||||
impl OperatorStorage for Veci8Cos {
|
||||
type Storage = veci8::Veci8Storage;
|
||||
}
|
||||
|
||||
impl OperatorStorage for Veci8Dot {
|
||||
type Storage = veci8::Veci8Storage;
|
||||
}
|
||||
|
||||
impl OperatorStorage for Veci8L2 {
|
||||
type Storage = veci8::Veci8Storage;
|
||||
}
|
||||
|
@ -7,7 +7,7 @@ use base::operator::*;
|
||||
use base::scalar::*;
|
||||
|
||||
pub trait OperatorStorage: Operator {
|
||||
type Storage: Storage<VectorOwned = Self::VectorOwned>;
|
||||
type Storage: Storage<VectorOwned = Self::VectorOwned> + Send + Sync;
|
||||
}
|
||||
|
||||
impl OperatorStorage for SVecf32Cos {
|
||||
|
@ -1,84 +1,82 @@
|
||||
use crate::Storage;
|
||||
pub use base::index::*;
|
||||
use base::operator::Operator;
|
||||
pub use base::scalar::*;
|
||||
pub use base::search::*;
|
||||
pub use base::vector::*;
|
||||
use base::scalar::*;
|
||||
use base::search::*;
|
||||
use base::vector::*;
|
||||
use common::json::Json;
|
||||
use common::mmap_array::MmapArray;
|
||||
use std::path::Path;
|
||||
|
||||
pub struct SVecStorage {
|
||||
dims: Json<u32>,
|
||||
len: Json<u32>,
|
||||
indexes: MmapArray<u32>,
|
||||
values: MmapArray<F32>,
|
||||
offsets: MmapArray<usize>,
|
||||
payload: MmapArray<Payload>,
|
||||
dims: u32,
|
||||
}
|
||||
|
||||
impl Storage for SVecStorage {
|
||||
type VectorOwned = SVecf32Owned;
|
||||
|
||||
impl<O: Operator<VectorOwned = SVecf32Owned>> Vectors<O> for SVecStorage {
|
||||
fn dims(&self) -> u32 {
|
||||
self.dims
|
||||
*self.dims
|
||||
}
|
||||
|
||||
fn len(&self) -> u32 {
|
||||
self.payload.len() as u32
|
||||
*self.len
|
||||
}
|
||||
|
||||
fn vector(&self, i: u32) -> SVecf32Borrowed<'_> {
|
||||
let s = self.offsets[i as usize];
|
||||
let e = self.offsets[i as usize + 1];
|
||||
unsafe {
|
||||
SVecf32Borrowed::new_unchecked(self.dims, &self.indexes[s..e], &self.values[s..e])
|
||||
}
|
||||
}
|
||||
|
||||
fn payload(&self, i: u32) -> Payload {
|
||||
self.payload[i as usize]
|
||||
}
|
||||
|
||||
fn open(path: &Path, options: IndexOptions) -> Self
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
let indexes = MmapArray::open(&path.join("indexes"));
|
||||
let values = MmapArray::open(&path.join("values"));
|
||||
let offsets = MmapArray::open(&path.join("offsets"));
|
||||
let payload = MmapArray::open(&path.join("payload"));
|
||||
Self {
|
||||
indexes,
|
||||
values,
|
||||
offsets,
|
||||
payload,
|
||||
dims: options.vector.dims,
|
||||
}
|
||||
}
|
||||
|
||||
fn save<O: Operator<VectorOwned = SVecf32Owned>, C: Collection<O>>(
|
||||
path: &Path,
|
||||
collection: &C,
|
||||
) -> Self {
|
||||
let n = collection.len();
|
||||
let indexes_iter = (0..n).flat_map(|i| collection.vector(i).indexes().to_vec());
|
||||
let values_iter = (0..n).flat_map(|i| collection.vector(i).values().to_vec());
|
||||
let offsets_iter = std::iter::once(0)
|
||||
.chain((0..n).map(|i| collection.vector(i).len() as usize))
|
||||
.scan(0, |state, x| {
|
||||
*state += x;
|
||||
Some(*state)
|
||||
});
|
||||
let payload_iter = (0..n).map(|i| collection.payload(i));
|
||||
let indexes = MmapArray::create(&path.join("indexes"), indexes_iter);
|
||||
let values = MmapArray::create(&path.join("values"), values_iter);
|
||||
let offsets = MmapArray::create(&path.join("offsets"), offsets_iter);
|
||||
let payload = MmapArray::create(&path.join("payload"), payload_iter);
|
||||
Self {
|
||||
indexes,
|
||||
values,
|
||||
offsets,
|
||||
payload,
|
||||
dims: collection.dims(),
|
||||
SVecf32Borrowed::new_unchecked(*self.dims, &self.indexes[s..e], &self.values[s..e])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Operator<VectorOwned = SVecf32Owned>> Storage<O> for SVecStorage {
|
||||
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<O>) -> Self {
|
||||
std::fs::create_dir(path.as_ref()).unwrap();
|
||||
let dims = Json::create(path.as_ref().join("dims"), vectors.dims());
|
||||
let len = Json::create(path.as_ref().join("len"), vectors.len());
|
||||
let indexes = MmapArray::create(
|
||||
path.as_ref().join("indexes"),
|
||||
(0..*len).flat_map(|i| vectors.vector(i).indexes().to_vec()),
|
||||
);
|
||||
let values = MmapArray::create(
|
||||
path.as_ref().join("values"),
|
||||
(0..*len).flat_map(|i| vectors.vector(i).values().to_vec()),
|
||||
);
|
||||
let offsets = MmapArray::create(
|
||||
path.as_ref().join("offsets"),
|
||||
std::iter::once(0)
|
||||
.chain((0..*len).map(|i| vectors.vector(i).len() as usize))
|
||||
.scan(0, |state, x| {
|
||||
*state += x;
|
||||
Some(*state)
|
||||
}),
|
||||
);
|
||||
common::dir_ops::sync_dir(path);
|
||||
Self {
|
||||
dims,
|
||||
len,
|
||||
indexes,
|
||||
values,
|
||||
offsets,
|
||||
}
|
||||
}
|
||||
|
||||
fn open(path: impl AsRef<Path>) -> Self {
|
||||
let dims = Json::open(path.as_ref().join("dims"));
|
||||
let len = Json::open(path.as_ref().join("len"));
|
||||
let indexes = MmapArray::open(path.as_ref().join("indexes"));
|
||||
let values = MmapArray::open(path.as_ref().join("values"));
|
||||
let offsets = MmapArray::open(path.as_ref().join("offsets"));
|
||||
Self {
|
||||
dims,
|
||||
len,
|
||||
indexes,
|
||||
values,
|
||||
offsets,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,116 +1,88 @@
|
||||
use crate::Storage;
|
||||
pub use base::index::*;
|
||||
use base::operator::Operator;
|
||||
pub use base::scalar::*;
|
||||
pub use base::search::*;
|
||||
pub use base::vector::*;
|
||||
use base::scalar::*;
|
||||
use base::search::*;
|
||||
use base::vector::*;
|
||||
use common::json::Json;
|
||||
use common::mmap_array::MmapArray;
|
||||
use std::path::Path;
|
||||
|
||||
pub struct VecStorage<T> {
|
||||
vectors: MmapArray<T>,
|
||||
payload: MmapArray<Payload>,
|
||||
dims: u16,
|
||||
dims: Json<u32>,
|
||||
len: Json<u32>,
|
||||
slice: MmapArray<T>,
|
||||
}
|
||||
|
||||
impl Storage for VecStorage<F32> {
|
||||
type VectorOwned = Vecf32Owned;
|
||||
|
||||
impl<O: Operator<VectorOwned = Vecf32Owned>> Vectors<O> for VecStorage<F32> {
|
||||
fn dims(&self) -> u32 {
|
||||
self.dims as u32
|
||||
*self.dims
|
||||
}
|
||||
|
||||
fn len(&self) -> u32 {
|
||||
self.payload.len() as u32
|
||||
*self.len
|
||||
}
|
||||
|
||||
fn vector(&self, i: u32) -> Vecf32Borrowed<'_> {
|
||||
let s = i as usize * self.dims as usize;
|
||||
let e = (i + 1) as usize * self.dims as usize;
|
||||
Vecf32Borrowed::new(&self.vectors[s..e])
|
||||
}
|
||||
|
||||
fn payload(&self, i: u32) -> Payload {
|
||||
self.payload[i as usize]
|
||||
}
|
||||
|
||||
fn open(path: &Path, options: IndexOptions) -> Self
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
let vectors = MmapArray::open(&path.join("vectors"));
|
||||
let payload = MmapArray::open(&path.join("payload"));
|
||||
Self {
|
||||
vectors,
|
||||
payload,
|
||||
dims: options.vector.dims.try_into().unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
fn save<O: Operator<VectorOwned = Self::VectorOwned>, C: Collection<O>>(
|
||||
path: &Path,
|
||||
collection: &C,
|
||||
) -> Self {
|
||||
let n = collection.len();
|
||||
let vectors_iter = (0..n).flat_map(|i| collection.vector(i).to_vec());
|
||||
let payload_iter = (0..n).map(|i| collection.payload(i));
|
||||
let vectors = MmapArray::create(&path.join("vectors"), vectors_iter);
|
||||
let payload = MmapArray::create(&path.join("payload"), payload_iter);
|
||||
Self {
|
||||
vectors,
|
||||
payload,
|
||||
dims: collection.dims().try_into().unwrap(),
|
||||
}
|
||||
let s = i as usize * *self.dims as usize;
|
||||
let e = (i + 1) as usize * *self.dims as usize;
|
||||
Vecf32Borrowed::new(&self.slice[s..e])
|
||||
}
|
||||
}
|
||||
|
||||
impl Storage for VecStorage<F16> {
|
||||
type VectorOwned = Vecf16Owned;
|
||||
impl<O: Operator<VectorOwned = Vecf32Owned>> Storage<O> for VecStorage<F32> {
|
||||
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<O>) -> Self {
|
||||
std::fs::create_dir(path.as_ref()).unwrap();
|
||||
let dims = Json::create(path.as_ref().join("dims"), vectors.dims());
|
||||
let len = Json::create(path.as_ref().join("len"), vectors.len());
|
||||
let slice = MmapArray::create(
|
||||
path.as_ref().join("slice"),
|
||||
(0..*len).flat_map(|i| vectors.vector(i).to_vec()),
|
||||
);
|
||||
common::dir_ops::sync_dir(path.as_ref());
|
||||
Self { dims, len, slice }
|
||||
}
|
||||
|
||||
fn open(path: impl AsRef<Path>) -> Self {
|
||||
let dims = Json::open(path.as_ref().join("dims"));
|
||||
let len = Json::open(path.as_ref().join("len"));
|
||||
let slice = MmapArray::open(path.as_ref().join("slice"));
|
||||
Self { dims, len, slice }
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Operator<VectorOwned = Vecf16Owned>> Vectors<O> for VecStorage<F16> {
|
||||
fn dims(&self) -> u32 {
|
||||
self.dims as u32
|
||||
*self.dims
|
||||
}
|
||||
|
||||
fn len(&self) -> u32 {
|
||||
self.payload.len() as u32
|
||||
*self.len
|
||||
}
|
||||
|
||||
fn vector(&self, i: u32) -> Vecf16Borrowed {
|
||||
let s = i as usize * self.dims as usize;
|
||||
let e = (i + 1) as usize * self.dims as usize;
|
||||
Vecf16Borrowed::new(&self.vectors[s..e])
|
||||
}
|
||||
|
||||
fn payload(&self, i: u32) -> Payload {
|
||||
self.payload[i as usize]
|
||||
}
|
||||
|
||||
fn open(path: &Path, options: IndexOptions) -> Self
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
let vectors = MmapArray::open(&path.join("vectors"));
|
||||
let payload = MmapArray::open(&path.join("payload"));
|
||||
Self {
|
||||
vectors,
|
||||
payload,
|
||||
dims: options.vector.dims.try_into().unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
fn save<O: Operator<VectorOwned = Self::VectorOwned>, C: Collection<O>>(
|
||||
path: &Path,
|
||||
collection: &C,
|
||||
) -> Self {
|
||||
let n = collection.len();
|
||||
let vectors_iter = (0..n).flat_map(|i| collection.vector(i).to_vec());
|
||||
let payload_iter = (0..n).map(|i| collection.payload(i));
|
||||
let vectors = MmapArray::create(&path.join("vectors"), vectors_iter);
|
||||
let payload = MmapArray::create(&path.join("payload"), payload_iter);
|
||||
Self {
|
||||
vectors,
|
||||
payload,
|
||||
dims: collection.dims().try_into().unwrap(),
|
||||
}
|
||||
let s = i as usize * *self.dims as usize;
|
||||
let e = (i + 1) as usize * *self.dims as usize;
|
||||
Vecf16Borrowed::new(&self.slice[s..e])
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: Operator<VectorOwned = Vecf16Owned>> Storage<O> for VecStorage<F16> {
|
||||
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<O>) -> Self {
|
||||
std::fs::create_dir(path.as_ref()).unwrap();
|
||||
let dims = Json::create(path.as_ref().join("dims"), vectors.dims());
|
||||
let len = Json::create(path.as_ref().join("len"), vectors.len());
|
||||
let slice = MmapArray::create(
|
||||
path.as_ref().join("slice"),
|
||||
(0..*len).flat_map(|i| vectors.vector(i).to_vec()),
|
||||
);
|
||||
common::dir_ops::sync_dir(path);
|
||||
Self { dims, len, slice }
|
||||
}
|
||||
|
||||
fn open(path: impl AsRef<Path>) -> Self {
|
||||
let dims = Json::open(path.as_ref().join("dims"));
|
||||
let len = Json::open(path.as_ref().join("len"));
|
||||
let slice = MmapArray::open(path.as_ref().join("slice"));
|
||||
Self { dims, len, slice }
|
||||
}
|
||||
}
|
||||
|
@ -1,40 +1,38 @@
|
||||
use crate::Storage;
|
||||
pub use base::index::*;
|
||||
use base::operator::Operator;
|
||||
pub use base::scalar::*;
|
||||
pub use base::search::*;
|
||||
pub use base::vector::*;
|
||||
use base::scalar::*;
|
||||
use base::search::*;
|
||||
use base::vector::*;
|
||||
use common::json::Json;
|
||||
use common::mmap_array::MmapArray;
|
||||
use std::path::Path;
|
||||
|
||||
pub struct Veci8Storage {
|
||||
vectors: MmapArray<I8>,
|
||||
dims: Json<u32>,
|
||||
len: Json<u32>,
|
||||
slice: MmapArray<I8>,
|
||||
alphas: MmapArray<F32>,
|
||||
offsets: MmapArray<F32>,
|
||||
sums: MmapArray<F32>,
|
||||
l2_norms: MmapArray<F32>,
|
||||
payload: MmapArray<Payload>,
|
||||
dims: u32,
|
||||
}
|
||||
|
||||
impl Storage for Veci8Storage {
|
||||
type VectorOwned = Veci8Owned;
|
||||
|
||||
impl<O: Operator<VectorOwned = Veci8Owned>> Vectors<O> for Veci8Storage {
|
||||
fn dims(&self) -> u32 {
|
||||
self.dims
|
||||
*self.dims
|
||||
}
|
||||
|
||||
fn len(&self) -> u32 {
|
||||
self.payload.len() as u32
|
||||
*self.len
|
||||
}
|
||||
|
||||
fn vector(&self, i: u32) -> Veci8Borrowed<'_> {
|
||||
let s = i as usize * self.dims as usize;
|
||||
let e = (i + 1) as usize * self.dims as usize;
|
||||
let s = i as usize * *self.dims as usize;
|
||||
let e = (i + 1) as usize * *self.dims as usize;
|
||||
unsafe {
|
||||
Veci8Borrowed::new_unchecked(
|
||||
self.dims,
|
||||
&self.vectors[s..e],
|
||||
*self.dims,
|
||||
&self.slice[s..e],
|
||||
self.alphas[i as usize],
|
||||
self.offsets[i as usize],
|
||||
self.sums[i as usize],
|
||||
@ -42,58 +40,61 @@ impl Storage for Veci8Storage {
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn payload(&self, i: u32) -> Payload {
|
||||
self.payload[i as usize]
|
||||
}
|
||||
|
||||
fn open(path: &Path, options: IndexOptions) -> Self
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
let vectors = MmapArray::open(&path.join("vectors"));
|
||||
let alphas = MmapArray::open(&path.join("alphas"));
|
||||
let offsets = MmapArray::open(&path.join("offsets"));
|
||||
let sums = MmapArray::open(&path.join("sums"));
|
||||
let l2_norms = MmapArray::open(&path.join("l2_norms"));
|
||||
let payload = MmapArray::open(&path.join("payload"));
|
||||
impl<O: Operator<VectorOwned = Veci8Owned>> Storage<O> for Veci8Storage {
|
||||
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<O>) -> Self {
|
||||
std::fs::create_dir(path.as_ref()).unwrap();
|
||||
let dims = Json::create(path.as_ref().join("dims"), vectors.dims());
|
||||
let len = Json::create(path.as_ref().join("len"), vectors.len());
|
||||
let slice = MmapArray::create(
|
||||
path.as_ref().join("slice"),
|
||||
(0..*len).flat_map(|i| vectors.vector(i).data().to_vec()),
|
||||
);
|
||||
let alphas = MmapArray::create(
|
||||
path.as_ref().join("alphas"),
|
||||
(0..*len).map(|i| vectors.vector(i).alpha()),
|
||||
);
|
||||
let offsets = MmapArray::create(
|
||||
path.as_ref().join("offsets"),
|
||||
(0..*len).map(|i| vectors.vector(i).offset()),
|
||||
);
|
||||
let sums = MmapArray::create(
|
||||
path.as_ref().join("sums"),
|
||||
(0..*len).map(|i| vectors.vector(i).sum()),
|
||||
);
|
||||
let l2_norms = MmapArray::create(
|
||||
path.as_ref().join("l2_norms"),
|
||||
(0..*len).map(|i| vectors.vector(i).l2_norm()),
|
||||
);
|
||||
common::dir_ops::sync_dir(path);
|
||||
Self {
|
||||
vectors,
|
||||
dims,
|
||||
len,
|
||||
slice,
|
||||
alphas,
|
||||
offsets,
|
||||
sums,
|
||||
l2_norms,
|
||||
payload,
|
||||
dims: options.vector.dims,
|
||||
}
|
||||
}
|
||||
|
||||
fn save<O: Operator<VectorOwned = Veci8Owned>, C: Collection<O>>(
|
||||
path: &Path,
|
||||
collection: &C,
|
||||
) -> Self {
|
||||
let n = collection.len();
|
||||
// TODO: how to avoid clone here?
|
||||
let vectors_iter = (0..n).flat_map(|i| collection.vector(i).data().to_vec());
|
||||
let alphas_iter = (0..n).map(|i| collection.vector(i).alpha());
|
||||
let offsets_iter = (0..n).map(|i| collection.vector(i).offset());
|
||||
let sums_iter = (0..n).map(|i| collection.vector(i).sum());
|
||||
let l2_norms_iter = (0..n).map(|i| collection.vector(i).l2_norm());
|
||||
let payload_iter = (0..n).map(|i| collection.payload(i));
|
||||
let vectors = MmapArray::create(&path.join("vectors"), vectors_iter);
|
||||
let alphas = MmapArray::create(&path.join("alphas"), alphas_iter);
|
||||
let offsets = MmapArray::create(&path.join("offsets"), offsets_iter);
|
||||
let sums = MmapArray::create(&path.join("sums"), sums_iter);
|
||||
let l2_norms = MmapArray::create(&path.join("l2_norms"), l2_norms_iter);
|
||||
let payload = MmapArray::create(&path.join("payload"), payload_iter);
|
||||
fn open(path: impl AsRef<Path>) -> Self {
|
||||
let dims = Json::open(path.as_ref().join("dims"));
|
||||
let len = Json::open(path.as_ref().join("len"));
|
||||
let slice = MmapArray::open(path.as_ref().join("slice"));
|
||||
let alphas = MmapArray::open(path.as_ref().join("alphas"));
|
||||
let offsets = MmapArray::open(path.as_ref().join("offsets"));
|
||||
let sums = MmapArray::open(path.as_ref().join("sums"));
|
||||
let l2_norms = MmapArray::open(path.as_ref().join("l2_norms"));
|
||||
Self {
|
||||
vectors,
|
||||
dims,
|
||||
len,
|
||||
slice,
|
||||
alphas,
|
||||
offsets,
|
||||
sums,
|
||||
l2_norms,
|
||||
payload,
|
||||
dims: collection.dims(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -30,26 +30,19 @@ pub fn is_started() -> bool {
|
||||
#[pgrx::pg_guard]
|
||||
#[no_mangle]
|
||||
extern "C" fn _vectors_main(_arg: pgrx::pg_sys::Datum) {
|
||||
pub struct AllocErrorPanicPayload {
|
||||
pub layout: std::alloc::Layout,
|
||||
}
|
||||
{
|
||||
let mut builder = crate::logger::VectorLogger::build();
|
||||
#[cfg(not(debug_assertions))]
|
||||
{
|
||||
builder.filter_level(log::LevelFilter::Info);
|
||||
}
|
||||
#[cfg(debug_assertions)]
|
||||
// #[cfg(debug_assertions)]
|
||||
{
|
||||
builder.filter_level(log::LevelFilter::Trace);
|
||||
}
|
||||
builder.init();
|
||||
}
|
||||
std::panic::set_hook(Box::new(|info| {
|
||||
if let Some(oom) = info.payload().downcast_ref::<AllocErrorPanicPayload>() {
|
||||
log::error!("Out of memory. Layout: {:?}.", oom.layout);
|
||||
return;
|
||||
}
|
||||
let backtrace;
|
||||
#[cfg(not(debug_assertions))]
|
||||
{
|
||||
@ -61,9 +54,6 @@ extern "C" fn _vectors_main(_arg: pgrx::pg_sys::Datum) {
|
||||
}
|
||||
log::error!("Panickied. Info: {:?}. Backtrace: {}.", info, backtrace);
|
||||
}));
|
||||
std::alloc::set_alloc_error_hook(|layout| {
|
||||
std::panic::panic_any(AllocErrorPanicPayload { layout });
|
||||
});
|
||||
use service::Version;
|
||||
use service::Worker;
|
||||
use std::path::Path;
|
||||
|
@ -324,7 +324,7 @@ mod tests {
|
||||
let indexes_20: Vec<u32> = vec![
|
||||
1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
];
|
||||
let values_20: Vec<F32> = vec![
|
||||
let values_20: Vec<F32> = [
|
||||
1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
]
|
||||
.iter()
|
||||
@ -343,7 +343,7 @@ mod tests {
|
||||
values: values_20.clone(),
|
||||
};
|
||||
let sindexes = vec![0, 2, 4, 6, 8, 10, 12, 14, 16, 18];
|
||||
let svalues: Vec<F32> = vec![1, 2, 4, 6, 8, 10, 12, 14, 16, 18]
|
||||
let svalues: Vec<F32> = [1, 2, 4, 6, 8, 10, 12, 14, 16, 18]
|
||||
.iter()
|
||||
.map(|&x| F32(x as f32))
|
||||
.collect();
|
||||
@ -356,7 +356,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
state.values(),
|
||||
vec![1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
|
||||
[1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
|
||||
.iter()
|
||||
.map(|&x| F32(x as f32))
|
||||
.collect::<Vec<F32>>()
|
||||
@ -378,14 +378,14 @@ mod tests {
|
||||
assert_eq!(result_len, 16);
|
||||
assert_eq!(
|
||||
state.indexes(),
|
||||
vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 18]
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 18]
|
||||
.iter()
|
||||
.map(|&x| x as u32)
|
||||
.collect::<Vec<u32>>()
|
||||
);
|
||||
assert_eq!(
|
||||
state.values(),
|
||||
vec![1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 18]
|
||||
[1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 18]
|
||||
.iter()
|
||||
.map(|&x| F32(x as f32))
|
||||
.collect::<Vec<F32>>()
|
||||
@ -402,7 +402,7 @@ mod tests {
|
||||
values: values_20.clone(),
|
||||
};
|
||||
let sindexes = vec![0, 3, 6, 9, 12, 15, 18];
|
||||
let svalues: Vec<F32> = vec![1, 1, 1, 1, 1, 1, 1]
|
||||
let svalues: Vec<F32> = [1, 1, 1, 1, 1, 1, 1]
|
||||
.iter()
|
||||
.map(|&x| F32(x as f32))
|
||||
.collect();
|
||||
@ -412,14 +412,14 @@ mod tests {
|
||||
assert_eq!(result_len, 14);
|
||||
assert_eq!(
|
||||
state.indexes(),
|
||||
vec![0, 1, 3, 5, 6, 7, 9, 11, 12, 13, 15, 17, 18, 19]
|
||||
[0, 1, 3, 5, 6, 7, 9, 11, 12, 13, 15, 17, 18, 19]
|
||||
.iter()
|
||||
.map(|&x| x as u32)
|
||||
.collect::<Vec<u32>>()
|
||||
);
|
||||
assert_eq!(
|
||||
state.values(),
|
||||
vec![1, 1, 4, 5, 1, 7, 10, 11, 1, 13, 16, 17, 1, 19]
|
||||
[1, 1, 4, 5, 1, 7, 10, 11, 1, 13, 16, 17, 1, 19]
|
||||
.iter()
|
||||
.map(|&x| F32(x as f32))
|
||||
.collect::<Vec<F32>>()
|
||||
|
@ -5,6 +5,8 @@ static IVF_NPROBE: GucSetting<i32> = GucSetting::<i32>::new(10);
|
||||
|
||||
static HNSW_EF_SEARCH: GucSetting<i32> = GucSetting::<i32>::new(100);
|
||||
|
||||
static DISKANN_EF_SEARCH: GucSetting<i32> = GucSetting::<i32>::new(100);
|
||||
|
||||
pub unsafe fn init() {
|
||||
GucRegistry::define_int_guc(
|
||||
"vectors.ivf_nprobe",
|
||||
@ -12,7 +14,7 @@ pub unsafe fn init() {
|
||||
"https://docs.pgvecto.rs/usage/search.html",
|
||||
&IVF_NPROBE,
|
||||
1,
|
||||
1_000_000,
|
||||
u16::MAX as _,
|
||||
GucContext::Userset,
|
||||
GucFlags::default(),
|
||||
);
|
||||
@ -26,11 +28,22 @@ pub unsafe fn init() {
|
||||
GucContext::Userset,
|
||||
GucFlags::default(),
|
||||
);
|
||||
GucRegistry::define_int_guc(
|
||||
"vectors.diskann_ef_search",
|
||||
"`ef_search` argument of DiskANN algorithm.",
|
||||
"https://docs.pgvecto.rs/usage/search.html",
|
||||
&DISKANN_EF_SEARCH,
|
||||
1,
|
||||
u16::MAX as _,
|
||||
GucContext::Userset,
|
||||
GucFlags::default(),
|
||||
);
|
||||
}
|
||||
|
||||
pub fn search_options() -> SearchOptions {
|
||||
SearchOptions {
|
||||
hnsw_ef_search: HNSW_EF_SEARCH.get() as u32,
|
||||
diskann_ef_search: DISKANN_EF_SEARCH.get() as u32,
|
||||
ivf_nprobe: IVF_NPROBE.get() as u32,
|
||||
}
|
||||
}
|
||||
|
@ -87,16 +87,14 @@ fn convert_name_to_vd(name: &str) -> Option<(VectorKind, DistanceKind)> {
|
||||
|
||||
unsafe fn convert_reloptions_to_options(
|
||||
reloptions: *const pgrx::pg_sys::varlena,
|
||||
) -> (SegmentsOptions, OptimizingOptions, IndexingOptions) {
|
||||
) -> (IndexingOptions, IndexAlterableOptions) {
|
||||
#[derive(Debug, Clone, Deserialize, Default)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
struct Parsed {
|
||||
#[serde(default)]
|
||||
segment: SegmentsOptions,
|
||||
#[serde(default)]
|
||||
optimizing: OptimizingOptions,
|
||||
#[serde(default)]
|
||||
indexing: IndexingOptions,
|
||||
#[serde(flatten)]
|
||||
alterable: IndexAlterableOptions,
|
||||
}
|
||||
let reloption = reloptions as *const Reloption;
|
||||
if reloption.is_null() || unsafe { (*reloption).options == 0 } {
|
||||
@ -104,7 +102,7 @@ unsafe fn convert_reloptions_to_options(
|
||||
}
|
||||
let s = unsafe { (*reloption).options() }.to_string_lossy();
|
||||
match toml::from_str::<Parsed>(&s) {
|
||||
Ok(p) => (p.segment, p.optimizing, p.indexing),
|
||||
Ok(p) => (p.indexing, p.alterable),
|
||||
Err(e) => pgrx::error!("failed to parse options: {}", e),
|
||||
}
|
||||
}
|
||||
@ -124,15 +122,8 @@ pub unsafe fn options(index: pgrx::pg_sys::Relation) -> (IndexOptions, IndexAlte
|
||||
let dims = check_column_dims(typmod.dims()).get();
|
||||
// get v, d
|
||||
let (v, d) = convert_opfamily_to_vd(opfamily).unwrap();
|
||||
// get segment, optimizing, indexing
|
||||
let (segment, optimizing, indexing) =
|
||||
unsafe { convert_reloptions_to_options((*index).rd_options) };
|
||||
(
|
||||
IndexOptions {
|
||||
vector: VectorOptions { dims, v, d },
|
||||
segment,
|
||||
indexing,
|
||||
},
|
||||
IndexAlterableOptions { optimizing },
|
||||
)
|
||||
let vector = VectorOptions { dims, v, d };
|
||||
// get indexing, segment, optimizing
|
||||
let (indexing, alterable) = unsafe { convert_reloptions_to_options((*index).rd_options) };
|
||||
(IndexOptions { vector, indexing }, alterable)
|
||||
}
|
||||
|
@ -76,37 +76,34 @@ pub unsafe fn on_object_access(
|
||||
if sub_id != 0 {
|
||||
return;
|
||||
}
|
||||
match access {
|
||||
pgrx::pg_sys::ObjectAccessType_OAT_DROP => {
|
||||
let search = pgrx::pg_catalog::PgClass::search_reloid(object_id).unwrap();
|
||||
if let Some(pg_class) = search.get() {
|
||||
if let Some(()) = check_vector_index(pg_class) {
|
||||
let handle = from_oid_to_handle(object_id);
|
||||
let mut t = TRANSACTION.borrow_mut();
|
||||
match t.index.get(&handle) {
|
||||
Some(TransactionIndex::Create) => {
|
||||
// It's created in this transaction, so drop it immediately
|
||||
let handle = from_oid_to_handle(object_id);
|
||||
let mut rpc = check_client(client());
|
||||
if let Err(e) = rpc.drop(handle) {
|
||||
pgrx::warning!("Failed to drop {handle} for abortting: {e}.");
|
||||
}
|
||||
t.index.remove(&handle);
|
||||
}
|
||||
Some(TransactionIndex::Drop) => unreachable!(),
|
||||
Some(TransactionIndex::Dirty) => {
|
||||
// It's not created in this transaction but modified in this transaction
|
||||
t.index.insert(handle, TransactionIndex::Drop);
|
||||
}
|
||||
None => {
|
||||
// It's not created in this transaction and never modified in this transaction
|
||||
t.index.insert(handle, TransactionIndex::Drop);
|
||||
if access == pgrx::pg_sys::ObjectAccessType_OAT_DROP {
|
||||
let search = pgrx::pg_catalog::PgClass::search_reloid(object_id).unwrap();
|
||||
if let Some(pg_class) = search.get() {
|
||||
if let Some(()) = check_vector_index(pg_class) {
|
||||
let handle = from_oid_to_handle(object_id);
|
||||
let mut t = TRANSACTION.borrow_mut();
|
||||
match t.index.get(&handle) {
|
||||
Some(TransactionIndex::Create) => {
|
||||
// It's created in this transaction, so drop it immediately
|
||||
let handle = from_oid_to_handle(object_id);
|
||||
let mut rpc = check_client(client());
|
||||
if let Err(e) = rpc.drop(handle) {
|
||||
pgrx::warning!("Failed to drop {handle} for abortting: {e}.");
|
||||
}
|
||||
t.index.remove(&handle);
|
||||
}
|
||||
Some(TransactionIndex::Drop) => unreachable!(),
|
||||
Some(TransactionIndex::Dirty) => {
|
||||
// It's not created in this transaction but modified in this transaction
|
||||
t.index.insert(handle, TransactionIndex::Drop);
|
||||
}
|
||||
None => {
|
||||
// It's not created in this transaction and never modified in this transaction
|
||||
t.index.insert(handle, TransactionIndex::Drop);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -96,13 +96,13 @@ unsafe fn rewrite_type_options(istmt: *mut pgrx::pg_sys::IndexStmt, method: &str
|
||||
swap_destroy(&mut (*istmt).options, list_from_vec(vec![elem]));
|
||||
}
|
||||
"ivfflat" => {
|
||||
let nlist = opts
|
||||
let list = opts
|
||||
.get("list")
|
||||
.unwrap_or(&String::from("100"))
|
||||
.parse::<u32>()
|
||||
.unwrap();
|
||||
let arg = pgrx::pg_sys::makeString(
|
||||
format!("[indexing.ivf]\nnlist = {}", nlist).as_pg_cstr(),
|
||||
format!("[indexing.ivf]\nnlist = {}", list).as_pg_cstr(),
|
||||
);
|
||||
let elem = pgrx::pg_sys::makeDefElem("options".as_pg_cstr(), arg as _, -1);
|
||||
swap_destroy(&mut (*istmt).options, list_from_vec(vec![elem]));
|
||||
|
@ -1,10 +1,7 @@
|
||||
//! Postgres vector extension.
|
||||
//!
|
||||
//! Provides an easy-to-use extension for vector similarity search.
|
||||
#![feature(alloc_error_hook)]
|
||||
#![feature(slice_split_once)]
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
#![allow(clippy::single_match)]
|
||||
#![allow(clippy::too_many_arguments)]
|
||||
|
||||
mod bgworker;
|
||||
@ -40,6 +37,9 @@ unsafe extern "C" fn _PG_init() {
|
||||
#[cfg(not(all(target_endian = "little", target_pointer_width = "64")))]
|
||||
compile_error!("Target is not supported.");
|
||||
|
||||
#[cfg(not(any(feature = "pg14", feature = "pg15", feature = "pg16")))]
|
||||
compiler_error!("PostgreSQL version must be selected.");
|
||||
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
|
@ -8,7 +8,7 @@ statement ok
|
||||
CREATE TABLE t (val vector(3));
|
||||
|
||||
statement ok
|
||||
INSERT INTO t (val) SELECT ARRAY[random(), random(), random()]::real[] FROM generate_series(1, 1000);
|
||||
INSERT INTO t (val) SELECT ARRAY[random(), random(), random()]::real[] FROM generate_series(1, 10000);
|
||||
|
||||
# HNSW compatible Test
|
||||
statement ok
|
||||
@ -46,7 +46,7 @@ DROP INDEX hnsw_cosine_index;
|
||||
|
||||
# IVF compatible Test
|
||||
statement ok
|
||||
CREATE INDEX ivf_l2_index ON t USING ivfflat (val vector_l2_ops) WITH (lists = 100);
|
||||
CREATE INDEX ivf_l2_index ON t USING ivfflat (val vector_l2_ops) WITH (lists = 20);
|
||||
|
||||
query I
|
||||
SELECT COUNT(1) FROM (SELECT 1 FROM t ORDER BY val <-> '[0.5,0.5,0.5]' limit 10) t2;
|
||||
@ -57,7 +57,7 @@ statement ok
|
||||
DROP INDEX ivf_l2_index;
|
||||
|
||||
statement ok
|
||||
CREATE INDEX ivf_ip_index ON t USING ivfflat (val vector_ip_ops);
|
||||
CREATE INDEX ivf_ip_index ON t USING ivfflat (val vector_ip_ops) WITH (lists = 20);
|
||||
|
||||
query I
|
||||
SELECT COUNT(1) FROM (SELECT 1 FROM t ORDER BY val <#> '[0.5,0.5,0.5]' limit 10) t2;
|
||||
@ -68,7 +68,7 @@ statement ok
|
||||
DROP INDEX ivf_ip_index;
|
||||
|
||||
statement ok
|
||||
CREATE INDEX ivf_cosine_index ON t USING ivfflat (val vector_cosine_ops);
|
||||
CREATE INDEX ivf_cosine_index ON t USING ivfflat (val vector_cosine_ops) WITH (lists = 20);
|
||||
|
||||
query I
|
||||
SELECT COUNT(1) FROM (SELECT 1 FROM t ORDER BY val <=> '[0.5,0.5,0.5]' limit 10) t2;
|
||||
|
@ -13,7 +13,7 @@ statement ok
|
||||
CREATE INDEX ON t USING vectors (val vector_l2_ops)
|
||||
WITH (options = $$
|
||||
[indexing.ivf]
|
||||
nlist = 100
|
||||
nlist = 20
|
||||
$$);
|
||||
|
||||
statement ok
|
||||
|
Loading…
x
Reference in New Issue
Block a user