1
0
mirror of https://github.com/tensorchord/pgvecto.rs.git synced 2025-04-18 21:44:00 +03:00

feat: pyo3 bindings of indexing (#565)

* feat: pyo3 bindings of indexing

Signed-off-by: usamoi <usamoi@outlook.com>

* fix: ci

Signed-off-by: usamoi <usamoi@outlook.com>

---------

Signed-off-by: usamoi <usamoi@outlook.com>
This commit is contained in:
usamoi 2024-08-20 10:01:07 +08:00 committed by GitHub
parent 20e84ca9a2
commit ab1edc9777
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
43 changed files with 971 additions and 368 deletions

View File

@ -42,16 +42,19 @@ env:
SCCACHE_GHA_ENABLED: true
RUSTC_WRAPPER: sccache
RUSTFLAGS: "-Dwarnings"
CARGO_PROFILE_OPT_BUILD_OVERRIDE_DEBUG: true
jobs:
test:
strategy:
matrix:
version: [14, 15, 16]
arch: ["x86_64"]
runs-on: ubuntu-latest
env:
SEMVER: "0.0.0"
VERSION: ${{ matrix.version }}
ARCH: ${{ matrix.arch }}
steps:
- name: Checkout
uses: actions/checkout@v4
@ -81,41 +84,41 @@ jobs:
sudo apt-get update
sudo apt-get install -y clang-16
sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-16 128
- name: Set up Pgrx
run: |
# pg_config
mkdir -p ~/.pg_config
touch ~/.pg_config/pg_config
chmod 777 ~/.pg_config/pg_config
echo "#!/usr/bin/env bash" >> ~/.pg_config/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg${VERSION}_$(uname --machine)-unknown-linux-gnu.txt" >> ~/.pg_config/pg_config
mkdir -p ~/.pgrx && echo "configs.pg$VERSION=\"$HOME/.pg_config/pg_config\"" > ~/.pgrx/config.toml
# pgrx_binding
mkdir -p ~/.pgrx_binding
cp ./vendor/pgrx_binding/pg${VERSION}_$(uname --machine)-unknown-linux-gnu.rs ~/.pgrx_binding/pg${VERSION}_raw_bindings.rs
echo PGRX_TARGET_INFO_PATH_PG$VERSION=$HOME/.pgrx_binding >> "$GITHUB_ENV"
- name: Build Release
run: |
cargo build --lib --features "pg$VERSION" --profile opt
./tools/schema.sh --features "pg$VERSION" --profile opt | expand -t 4 > ./target/vectors--$SEMVER.sql
- name: Set up PostgreSQL
run: |
mkdir -p ~/.pg14/pg_binding && touch ~/.pg14/pg_config && chmod 777 ~/.pg14/pg_config
mkdir -p ~/.pg15/pg_binding && touch ~/.pg15/pg_config && chmod 777 ~/.pg15/pg_config
mkdir -p ~/.pg16/pg_binding && touch ~/.pg16/pg_config && chmod 777 ~/.pg16/pg_config
cp ./vendor/pgrx_binding/pg14_${ARCH}-unknown-linux-gnu.rs ~/.pg14/pg_binding/pg14_raw_bindings.rs
cp ./vendor/pgrx_binding/pg15_${ARCH}-unknown-linux-gnu.rs ~/.pg15/pg_binding/pg15_raw_bindings.rs
cp ./vendor/pgrx_binding/pg16_${ARCH}-unknown-linux-gnu.rs ~/.pg16/pg_binding/pg16_raw_bindings.rs
echo "#!/usr/bin/env bash" >> ~/.pg14/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg14_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg14/pg_config
echo "#!/usr/bin/env bash" >> ~/.pg15/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg15_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg15/pg_config
echo "#!/usr/bin/env bash" >> ~/.pg16/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg16_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg16/pg_config
- name: Build
run: |
export PGRX_PG_CONFIG_PATH=$HOME/.pg$VERSION/pg_config
export PGRX_TARGET_INFO_PATH_PG$VERSION=$HOME/.pg$VERSION/pg_binding
cargo build --package pgvectors --lib --features pg$VERSION --target $ARCH-unknown-linux-gnu --profile opt
./tools/schema.sh --features pg$VERSION --target $ARCH-unknown-linux-gnu --profile opt | expand -t 4 > ./target/schema.sql
- name: Install
run: |
sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" >> /etc/apt/sources.list.d/pgdg.list'
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
sudo apt-get update
sudo apt-get -y install postgresql-$VERSION
echo "local all all trust" | sudo tee /etc/postgresql/$VERSION/main/pg_hba.conf
echo "host all all 127.0.0.1/32 trust" | sudo tee -a /etc/postgresql/$VERSION/main/pg_hba.conf
echo "host all all ::1/128 trust" | sudo tee -a /etc/postgresql/$VERSION/main/pg_hba.conf
sudo systemctl restart postgresql
sudo -iu postgres createuser -s -r $USER
createdb
- name: Install Release
run: |
sudo cp ./target/vectors--$SEMVER.sql /usr/share/postgresql/$VERSION/extension/vectors--$SEMVER.sql
sudo cp ./target/opt/libvectors.so "/usr/lib/postgresql/$VERSION/lib/vectors.so"
sudo cp ./target/schema.sql /usr/share/postgresql/$VERSION/extension/vectors--$SEMVER.sql
sudo cp ./target/$ARCH-unknown-linux-gnu/opt/libvectors.so "/usr/lib/postgresql/$VERSION/lib/vectors.so"
sed -e "s/@CARGO_VERSION@/$SEMVER/g" < ./vectors.control | sudo tee "/usr/share/postgresql/$VERSION/extension/vectors.control"
psql -c 'ALTER SYSTEM SET shared_preload_libraries = "vectors.so"'

View File

@ -78,23 +78,26 @@ jobs:
sudo apt-get update
sudo apt-get install -y clang-16
sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-16 128
- name: Set up Pgrx
- name: Set up PostgreSQL
run: |
# pg_config
mkdir -p ~/.pg_config
touch ~/.pg_config/pg_config
chmod 777 ~/.pg_config/pg_config
echo "#!/usr/bin/env bash" >> ~/.pg_config/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg${VERSION}_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg_config/pg_config
mkdir -p ~/.pgrx && echo "configs.pg$VERSION=\"$HOME/.pg_config/pg_config\"" > ~/.pgrx/config.toml
# pgrx_binding
mkdir -p ~/.pgrx_binding
cp ./vendor/pgrx_binding/pg${VERSION}_$(uname --machine)-unknown-linux-gnu.rs ~/.pgrx_binding/pg${VERSION}_raw_bindings.rs
echo PGRX_TARGET_INFO_PATH_PG$VERSION=$HOME/.pgrx_binding >> "$GITHUB_ENV"
mkdir -p ~/.pg14/pg_binding && touch ~/.pg14/pg_config && chmod 777 ~/.pg14/pg_config
mkdir -p ~/.pg15/pg_binding && touch ~/.pg15/pg_config && chmod 777 ~/.pg15/pg_config
mkdir -p ~/.pg16/pg_binding && touch ~/.pg16/pg_config && chmod 777 ~/.pg16/pg_config
cp ./vendor/pgrx_binding/pg14_${ARCH}-unknown-linux-gnu.rs ~/.pg14/pg_binding/pg14_raw_bindings.rs
cp ./vendor/pgrx_binding/pg15_${ARCH}-unknown-linux-gnu.rs ~/.pg15/pg_binding/pg15_raw_bindings.rs
cp ./vendor/pgrx_binding/pg16_${ARCH}-unknown-linux-gnu.rs ~/.pg16/pg_binding/pg16_raw_bindings.rs
echo "#!/usr/bin/env bash" >> ~/.pg14/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg14_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg14/pg_config
echo "#!/usr/bin/env bash" >> ~/.pg15/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg15_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg15/pg_config
echo "#!/usr/bin/env bash" >> ~/.pg16/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg16_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg16/pg_config
- name: Build
run: |
cargo build --lib --features pg$VERSION --release --target $ARCH-unknown-linux-gnu
./tools/schema.sh --features pg$VERSION --release --target $ARCH-unknown-linux-gnu | expand -t 4 > ./target/vectors--$SEMVER.sql
export PGRX_PG_CONFIG_PATH=$HOME/.pg$VERSION/pg_config
export PGRX_TARGET_INFO_PATH_PG$VERSION=$HOME/.pg$VERSION/pg_binding
cargo build --package pgvectors --lib --features pg$VERSION --target $ARCH-unknown-linux-gnu --release
./tools/schema.sh --features pg$VERSION --target $ARCH-unknown-linux-gnu --release | expand -t 4 > ./target/schema.sql
- name: Package
run: |
export PLATFORM=$(echo $ARCH | sed 's/aarch64/arm64/; s/x86_64/amd64/')

View File

@ -42,83 +42,15 @@ env:
SCCACHE_GHA_ENABLED: true
RUSTC_WRAPPER: sccache
RUSTFLAGS: "-Dwarnings"
CARGO_PROFILE_OPT_BUILD_OVERRIDE_DEBUG: true
jobs:
check:
strategy:
matrix:
version: [14, 15, 16]
arch: ["x86_64", "aarch64"]
runs-on: ubuntu-latest
env:
SEMVER: "0.0.0"
VERSION: ${{ matrix.version }}
ARCH: ${{ matrix.arch }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Environment
run: |
sudo apt-get remove -y '^postgres.*' '^libpq.*' '^clang.*' '^llvm.*' '^libclang.*' '^libllvm.*' '^mono-llvm.*'
sudo apt-get purge -y '^postgres.*' '^libpq.*' '^clang.*' '^llvm.*' '^libclang.*' '^libllvm.*' '^mono-llvm.*'
sudo apt-get update
sudo apt-get install -y build-essential crossbuild-essential-arm64
sudo apt-get install -y qemu-user-static
touch ~/.cargo/config.toml
echo 'target.aarch64-unknown-linux-gnu.linker = "aarch64-linux-gnu-gcc"' >> ~/.cargo/config.toml
echo 'target.aarch64-unknown-linux-gnu.runner = ["qemu-aarch64-static", "-L", "/usr/aarch64-linux-gnu"]' >> ~/.cargo/config.toml
- name: Set up Sccache
uses: mozilla-actions/sccache-action@v0.0.4
- name: Set up Cache
uses: actions/cache/restore@v4
id: cache
with:
path: |
~/.cargo/registry/index/
~/.cargo/registry/cache/
~/.cargo/git/db/
key: ${{ github.job }}-${{ hashFiles('./Cargo.lock') }}-${{ matrix.version }}-${{ matrix.arch }}
- name: Set up Clang-16
run: |
sudo sh -c 'echo "deb http://apt.llvm.org/$(lsb_release -cs)/ llvm-toolchain-$(lsb_release -cs)-16 main" >> /etc/apt/sources.list'
wget --quiet -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
sudo apt-get update
sudo apt-get install -y clang-16
sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-16 128
- name: Set up Pgrx
run: |
# pg_config
mkdir -p ~/.pg_config
touch ~/.pg_config/pg_config
chmod 777 ~/.pg_config/pg_config
echo "#!/usr/bin/env bash" >> ~/.pg_config/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg${VERSION}_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg_config/pg_config
mkdir -p ~/.pgrx && echo "configs.pg$VERSION=\"$HOME/.pg_config/pg_config\"" > ~/.pgrx/config.toml
# pgrx_binding
mkdir -p ~/.pgrx_binding
cp ./vendor/pgrx_binding/pg${VERSION}_$(uname --machine)-unknown-linux-gnu.rs ~/.pgrx_binding/pg${VERSION}_raw_bindings.rs
echo PGRX_TARGET_INFO_PATH_PG$VERSION=$HOME/.pgrx_binding >> "$GITHUB_ENV"
- name: Clippy
run: cargo clippy --features "pg$VERSION" --target $ARCH-unknown-linux-gnu
- name: Build
run: cargo build --lib --features "pg$VERSION" --target $ARCH-unknown-linux-gnu
- name: Post Set up Cache
uses: actions/cache/save@v4
if: ${{ !steps.cache.outputs.cache-hit }}
with:
path: |
~/.cargo/registry/index/
~/.cargo/registry/cache/
~/.cargo/git/db/
key: ${{ github.job }}-${{ hashFiles('./Cargo.lock') }}-${{ matrix.version }}-${{ matrix.arch }}
test:
check_and_test:
strategy:
matrix:
arch: ["x86_64", "aarch64"]
runs-on: ubuntu-latest
env:
SEMVER: "0.0.0"
VERSION: "16"
ARCH: ${{ matrix.arch }}
steps:
- name: Checkout
@ -151,28 +83,54 @@ jobs:
sudo apt-get update
sudo apt-get install -y clang-16
sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-16 128
- name: Set up Pgrx
- name: Set up PostgreSQL
run: |
# pg_config
mkdir -p ~/.pg_config
touch ~/.pg_config/pg_config
chmod 777 ~/.pg_config/pg_config
echo "#!/usr/bin/env bash" >> ~/.pg_config/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg${VERSION}_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg_config/pg_config
mkdir -p ~/.pgrx && echo "configs.pg$VERSION=\"$HOME/.pg_config/pg_config\"" > ~/.pgrx/config.toml
# pgrx_binding
mkdir -p ~/.pgrx_binding
cp ./vendor/pgrx_binding/pg${VERSION}_$(uname --machine)-unknown-linux-gnu.rs ~/.pgrx_binding/pg${VERSION}_raw_bindings.rs
echo PGRX_TARGET_INFO_PATH_PG$VERSION=$HOME/.pgrx_binding >> "$GITHUB_ENV"
mkdir -p ~/.pg14/pg_binding && touch ~/.pg14/pg_config && chmod 777 ~/.pg14/pg_config
mkdir -p ~/.pg15/pg_binding && touch ~/.pg15/pg_config && chmod 777 ~/.pg15/pg_config
mkdir -p ~/.pg16/pg_binding && touch ~/.pg16/pg_config && chmod 777 ~/.pg16/pg_config
cp ./vendor/pgrx_binding/pg14_${ARCH}-unknown-linux-gnu.rs ~/.pg14/pg_binding/pg14_raw_bindings.rs
cp ./vendor/pgrx_binding/pg15_${ARCH}-unknown-linux-gnu.rs ~/.pg15/pg_binding/pg15_raw_bindings.rs
cp ./vendor/pgrx_binding/pg16_${ARCH}-unknown-linux-gnu.rs ~/.pg16/pg_binding/pg16_raw_bindings.rs
echo "#!/usr/bin/env bash" >> ~/.pg14/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg14_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg14/pg_config
echo "#!/usr/bin/env bash" >> ~/.pg15/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg15_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg15/pg_config
echo "#!/usr/bin/env bash" >> ~/.pg16/pg_config
echo "$(pwd)/tools/pg_config.sh \"\$@\" < $(pwd)/vendor/pg_config/pg16_${ARCH}-unknown-linux-gnu.txt" >> ~/.pg16/pg_config
- name: Clippy
run: |
cargo clippy --workspace --exclude pgvectors --exclude pyvectors --target $ARCH-unknown-linux-gnu
export PGRX_PG_CONFIG_PATH=$HOME/.pg14/pg_config
export PGRX_TARGET_INFO_PATH_PG14=$HOME/.pg14/pg_binding
cargo clippy --package pgvectors --features pg14 --no-deps --target $ARCH-unknown-linux-gnu
export PGRX_PG_CONFIG_PATH=$HOME/.pg15/pg_config
export PGRX_TARGET_INFO_PATH_PG15=$HOME/.pg15/pg_binding
cargo clippy --package pgvectors --features pg15 --no-deps --target $ARCH-unknown-linux-gnu
export PGRX_PG_CONFIG_PATH=$HOME/.pg16/pg_config
export PGRX_TARGET_INFO_PATH_PG16=$HOME/.pg16/pg_binding
cargo clippy --package pgvectors --features pg16 --no-deps --target $ARCH-unknown-linux-gnu
- name: Build
run: |
cargo build --workspace --exclude pgvectors --exclude pyvectors --target $ARCH-unknown-linux-gnu
export PGRX_PG_CONFIG_PATH=$HOME/.pg14/pg_config
export PGRX_TARGET_INFO_PATH_PG14=$HOME/.pg14/pg_binding
cargo build --package pgvectors --lib --features pg14 --target $ARCH-unknown-linux-gnu
export PGRX_PG_CONFIG_PATH=$HOME/.pg15/pg_config
export PGRX_TARGET_INFO_PATH_PG15=$HOME/.pg15/pg_binding
cargo build --package pgvectors --lib --features pg15 --target $ARCH-unknown-linux-gnu
export PGRX_PG_CONFIG_PATH=$HOME/.pg16/pg_config
export PGRX_TARGET_INFO_PATH_PG16=$HOME/.pg16/pg_binding
cargo build --package pgvectors --lib --features pg16 --target $ARCH-unknown-linux-gnu
- name: Test
run: cargo test --all --no-fail-fast --features "pg$VERSION" --target $ARCH-unknown-linux-gnu -- --nocapture
run: |
cargo test --workspace --exclude pgvectors --exclude pyvectors --no-fail-fast --target $ARCH-unknown-linux-gnu
- name: Test (x86_64)
if: matrix.arch == 'x86_64'
run: |
ASSETS=$(mktemp -d)
wget https://downloadmirror.intel.com/813591/sde-external-9.33.0-2024-01-07-lin.tar.xz -O $ASSETS/sde-external.tar.xz
tar -xf $ASSETS/sde-external.tar.xz -C $ASSETS
cargo --config "target.x86_64-unknown-linux-gnu.runner = [\"$ASSETS/sde-external-9.33.0-2024-01-07-lin/sde64\", \"-spr\", \"--\"]" test "_v4" --all --no-fail-fast --features "pg$VERSION" --target $ARCH-unknown-linux-gnu -- --nocapture
cargo --config "target.x86_64-unknown-linux-gnu.runner = [\"$ASSETS/sde-external-9.33.0-2024-01-07-lin/sde64\", \"-spr\", \"--\"]" test "_v4" --workspace --exclude pgvectors --exclude pyvectors --no-fail-fast --target $ARCH-unknown-linux-gnu
- name: Post Set up Cache
uses: actions/cache/save@v4
if: ${{ !steps.cache.outputs.cache-hit }}

249
Cargo.lock generated
View File

@ -827,11 +827,12 @@ dependencies = [
[[package]]
name = "dashmap"
version = "5.5.3"
version = "6.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856"
checksum = "804c8821570c3f8b70230c2ba75ffa5c0f9a4189b9a432b6656c536712acae28"
dependencies = [
"cfg-if",
"crossbeam-utils",
"hashbrown",
"lock_api",
"once_cell",
@ -1242,6 +1243,12 @@ dependencies = [
"stable_deref_trait",
]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
version = "0.1.19"
@ -1527,24 +1534,30 @@ dependencies = [
"crc32fast",
"crossbeam",
"dashmap",
"flat",
"hnsw",
"inverted",
"ivf",
"k_means",
"indexing",
"log",
"parking_lot",
"quantization",
"rabitq",
"rand",
"serde",
"serde_json",
"stoppable_rayon",
"storage",
"thiserror",
"validator",
]
[[package]]
name = "indexing"
version = "0.0.0"
dependencies = [
"base",
"flat",
"hnsw",
"inverted",
"ivf",
"rabitq",
"thiserror",
]
[[package]]
name = "indexmap"
version = "2.3.0"
@ -1555,6 +1568,12 @@ dependencies = [
"hashbrown",
]
[[package]]
name = "indoc"
version = "2.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
[[package]]
name = "instant"
version = "0.1.13"
@ -1825,6 +1844,15 @@ dependencies = [
"libc",
]
[[package]]
name = "memoffset"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
dependencies = [
"autocfg",
]
[[package]]
name = "mime"
version = "0.3.17"
@ -1887,6 +1915,19 @@ dependencies = [
"syn 2.0.72",
]
[[package]]
name = "ndarray"
version = "0.15.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
dependencies = [
"matrixmultiply",
"num-complex",
"num-integer",
"num-traits",
"rawpointer",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
@ -1952,6 +1993,21 @@ dependencies = [
"libm",
]
[[package]]
name = "numpy"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec170733ca37175f5d75a5bea5911d6ff45d2cd52849ce98b685394e4f2f37f4"
dependencies = [
"libc",
"ndarray",
"num-complex",
"num-integer",
"num-traits",
"pyo3",
"rustc-hash 1.1.0",
]
[[package]]
name = "object"
version = "0.36.3"
@ -2137,6 +2193,38 @@ dependencies = [
"unescape",
]
[[package]]
name = "pgvectors"
version = "0.0.0"
dependencies = [
"arrayvec",
"base",
"bincode",
"byteorder",
"chrono",
"detect",
"embedding",
"interprocess_atomic_wait",
"libc",
"log",
"memfd",
"memmap2",
"num-traits",
"paste",
"pgrx",
"rand",
"rustix 0.38.34",
"scopeguard",
"send_fd",
"serde",
"serde_json",
"service",
"thiserror",
"tikv-jemallocator",
"toml",
"validator",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
@ -2226,6 +2314,12 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "portable-atomic"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265"
[[package]]
name = "ppv-lite86"
version = "0.2.20"
@ -2274,6 +2368,97 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "pyo3"
version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5e00b96a521718e08e03b1a622f01c8a8deb50719335de3f60b3b3950f069d8"
dependencies = [
"cfg-if",
"indoc",
"libc",
"memoffset",
"parking_lot",
"portable-atomic",
"pyo3-build-config",
"pyo3-ffi",
"pyo3-macros",
"unindent",
]
[[package]]
name = "pyo3-build-config"
version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7883df5835fafdad87c0d888b266c8ec0f4c9ca48a5bed6bbb592e8dedee1b50"
dependencies = [
"once_cell",
"target-lexicon",
]
[[package]]
name = "pyo3-ffi"
version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01be5843dc60b916ab4dad1dca6d20b9b4e6ddc8e15f50c47fe6d85f1fb97403"
dependencies = [
"libc",
"pyo3-build-config",
]
[[package]]
name = "pyo3-macros"
version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77b34069fc0682e11b31dbd10321cbf94808394c56fd996796ce45217dfac53c"
dependencies = [
"proc-macro2",
"pyo3-macros-backend",
"quote",
"syn 2.0.72",
]
[[package]]
name = "pyo3-macros-backend"
version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08260721f32db5e1a5beae69a55553f56b99bd0e1c3e6e0a5e8851a9d0f5a85c"
dependencies = [
"heck",
"proc-macro2",
"pyo3-build-config",
"quote",
"syn 2.0.72",
]
[[package]]
name = "pythonize"
version = "0.21.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d0664248812c38cc55a4ed07f88e4df516ce82604b93b1ffdc041aa77a6cb3c"
dependencies = [
"pyo3",
"serde",
]
[[package]]
name = "pyvectors"
version = "0.0.0"
dependencies = [
"base",
"detect",
"indexing",
"ndarray",
"num-traits",
"numpy",
"pyo3",
"pythonize",
"serde",
"serde_json",
"stoppable_rayon",
"validator",
]
[[package]]
name = "quantization"
version = "0.0.0"
@ -2982,6 +3167,12 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "target-lexicon"
version = "0.12.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
[[package]]
name = "term"
version = "0.7.0"
@ -3242,6 +3433,12 @@ version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
[[package]]
name = "unindent"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce"
[[package]]
name = "untrusted"
version = "0.9.0"
@ -3310,38 +3507,6 @@ version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a84c137d37ab0142f0f2ddfe332651fdbf252e7b7dbb4e67b6c1f1b2e925101"
[[package]]
name = "vectors"
version = "0.0.0"
dependencies = [
"arrayvec",
"base",
"bincode",
"byteorder",
"chrono",
"detect",
"embedding",
"interprocess_atomic_wait",
"libc",
"log",
"memfd",
"memmap2",
"num-traits",
"paste",
"pgrx",
"rand",
"rustix 0.38.34",
"scopeguard",
"send_fd",
"serde",
"serde_json",
"service",
"thiserror",
"tikv-jemallocator",
"toml",
"validator",
]
[[package]]
name = "version_check"
version = "0.9.5"

View File

@ -1,13 +1,14 @@
[package]
name = "vectors"
name = "pgvectors"
version.workspace = true
edition.workspace = true
[lib]
name = "vectors"
crate-type = ["cdylib", "lib"]
[[bin]]
name = "pgrx_embed_vectors"
name = "pgrx_embed_pgvectors"
path = "./src/bin/pgrx_embed.rs"
[features]
@ -49,13 +50,8 @@ tikv-jemallocator = { version = "0.6.0", features = [
"disable_initial_exec_tls",
] }
[patch.crates-io]
pgrx = { git = "https://github.com/tensorchord/pgrx.git", branch = "v0.12.0-alpha.1-patch3" }
[lints]
rust.unsafe_op_in_unsafe_fn = "forbid"
rust.unused_lifetimes = "warn"
rust.unused_qualifications = "warn"
workspace = true
[workspace]
resolver = "2"
@ -90,6 +86,9 @@ rust.unsafe_op_in_unsafe_fn = "deny"
rust.unused_lifetimes = "warn"
rust.unused_qualifications = "warn"
[patch.crates-io]
pgrx = { git = "https://github.com/tensorchord/pgrx.git", branch = "v0.12.0-alpha.1-patch3" }
[profile.opt]
inherits = "dev"
opt-level = 3

View File

@ -560,31 +560,106 @@ impl Default for ProductQuantizationOptions {
}
}
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
#[derive(Debug, Clone, Serialize, Deserialize, Validate, Alter)]
pub struct SearchOptions {
#[serde(default = "SearchOptions::default_flat_sq_rerank_size")]
#[validate(range(min = 0, max = 65535))]
pub flat_sq_rerank_size: u32,
#[serde(default = "SearchOptions::default_flat_sq_fast_scan")]
pub flat_sq_fast_scan: bool,
#[serde(default = "SearchOptions::default_flat_pq_rerank_size")]
#[validate(range(min = 0, max = 65535))]
pub flat_pq_rerank_size: u32,
#[serde(default = "SearchOptions::default_flat_pq_fast_scan")]
pub flat_pq_fast_scan: bool,
#[serde(default = "SearchOptions::default_ivf_sq_rerank_size")]
#[validate(range(min = 0, max = 65535))]
pub ivf_sq_rerank_size: u32,
#[serde(default = "SearchOptions::default_ivf_sq_fast_scan")]
pub ivf_sq_fast_scan: bool,
#[serde(default = "SearchOptions::default_ivf_pq_rerank_size")]
#[validate(range(min = 0, max = 65535))]
pub ivf_pq_rerank_size: u32,
#[serde(default = "SearchOptions::default_ivf_pq_fast_scan")]
pub ivf_pq_fast_scan: bool,
#[serde(default = "SearchOptions::default_ivf_nprobe")]
#[validate(range(min = 1, max = 65535))]
pub ivf_nprobe: u32,
#[serde(default = "SearchOptions::default_hnsw_ef_search")]
#[validate(range(min = 1, max = 65535))]
pub hnsw_ef_search: u32,
#[serde(default = "SearchOptions::default_rabitq_nprobe")]
#[validate(range(min = 1, max = 65535))]
pub rabitq_nprobe: u32,
#[serde(default = "SearchOptions::default_rabitq_fast_scan")]
pub rabitq_fast_scan: bool,
#[serde(default = "SearchOptions::default_diskann_ef_search")]
#[validate(range(min = 1, max = 65535))]
pub diskann_ef_search: u32,
}
impl SearchOptions {
pub const fn default_flat_sq_rerank_size() -> u32 {
0
}
pub const fn default_flat_sq_fast_scan() -> bool {
false
}
pub const fn default_flat_pq_rerank_size() -> u32 {
0
}
pub const fn default_flat_pq_fast_scan() -> bool {
false
}
pub const fn default_ivf_sq_rerank_size() -> u32 {
0
}
pub const fn default_ivf_sq_fast_scan() -> bool {
false
}
pub const fn default_ivf_pq_rerank_size() -> u32 {
0
}
pub const fn default_ivf_pq_fast_scan() -> bool {
false
}
pub const fn default_ivf_nprobe() -> u32 {
10
}
pub const fn default_hnsw_ef_search() -> u32 {
100
}
pub const fn default_rabitq_nprobe() -> u32 {
10
}
pub const fn default_rabitq_fast_scan() -> bool {
true
}
pub const fn default_diskann_ef_search() -> u32 {
100
}
}
impl Default for SearchOptions {
fn default() -> Self {
Self {
flat_sq_rerank_size: Self::default_flat_sq_rerank_size(),
flat_sq_fast_scan: Self::default_flat_sq_fast_scan(),
flat_pq_rerank_size: Self::default_flat_pq_rerank_size(),
flat_pq_fast_scan: Self::default_flat_pq_fast_scan(),
ivf_sq_rerank_size: Self::default_ivf_sq_rerank_size(),
ivf_sq_fast_scan: Self::default_ivf_sq_fast_scan(),
ivf_pq_rerank_size: Self::default_ivf_pq_rerank_size(),
ivf_pq_fast_scan: Self::default_ivf_pq_fast_scan(),
ivf_nprobe: Self::default_ivf_nprobe(),
hnsw_ef_search: Self::default_hnsw_ef_search(),
rabitq_nprobe: Self::default_rabitq_nprobe(),
rabitq_fast_scan: Self::default_rabitq_fast_scan(),
diskann_ef_search: Self::default_diskann_ef_search(),
}
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct IndexStat {
pub indexing: bool,

View File

@ -1,6 +1,6 @@
use crate::always_equal::AlwaysEqual;
use crate::operator::{Borrowed, Operator};
use crate::scalar::F32;
use crate::vector::VectorOwned;
use serde::{Deserialize, Serialize};
use std::any::Any;
use std::fmt::Display;
@ -73,17 +73,17 @@ pub struct Element {
pub payload: AlwaysEqual<Payload>,
}
pub trait Vectors<O: Operator>: Send + Sync {
pub trait Vectors<V: VectorOwned> {
fn dims(&self) -> u32;
fn len(&self) -> u32;
fn vector(&self, i: u32) -> Borrowed<'_, O>;
fn vector(&self, i: u32) -> V::Borrowed<'_>;
}
pub trait Collection<O: Operator>: Vectors<O> {
pub trait Collection {
fn payload(&self, i: u32) -> Payload;
}
pub trait Source<O: Operator>: Collection<O> {
pub trait Source {
fn get_main<T: Any>(&self) -> Option<&T>;
fn get_main_len(&self) -> u32;
fn check_existing(&self, i: u32) -> bool;

View File

@ -1,5 +1,5 @@
use base::operator::*;
use base::search::*;
use base::vector::VectorOwned;
use std::marker::PhantomData;
pub fn remap(
@ -36,14 +36,14 @@ pub fn remap(
remap
}
pub struct RemappedCollection<'a, O: Operator, C: Collection<O>> {
pub struct RemappedCollection<'a, V: VectorOwned, C: Collection> {
collection: &'a C,
remap: Vec<u32>,
barrier: u32,
_phantom: PhantomData<fn(O) -> O>,
_phantom: PhantomData<fn(V) -> V>,
}
impl<'a, O: Operator, S: Source<O>> RemappedCollection<'a, O, S> {
impl<'a, V: VectorOwned, S: Vectors<V> + Collection + Source> RemappedCollection<'a, V, S> {
pub fn from_source(source: &'a S) -> Self {
let barrier = source.get_main_len();
let remap = remap(source.len(), barrier, |i| source.check_existing(i));
@ -56,7 +56,7 @@ impl<'a, O: Operator, S: Source<O>> RemappedCollection<'a, O, S> {
}
}
impl<'a, O: Operator, C: Collection<O>> RemappedCollection<'a, O, C> {
impl<'a, V: VectorOwned, C: Vectors<V> + Collection> RemappedCollection<'a, V, C> {
pub fn from_collection(collection: &'a C, remap: Vec<u32>) -> Self {
assert_eq!(remap.len(), collection.len() as usize);
let barrier = collection.len();
@ -69,7 +69,7 @@ impl<'a, O: Operator, C: Collection<O>> RemappedCollection<'a, O, C> {
}
}
impl<'a, O: Operator, C: Collection<O>> RemappedCollection<'a, O, C> {
impl<V: VectorOwned, C: Collection> RemappedCollection<'_, V, C> {
#[inline(always)]
pub fn skip(&self, x: u32) -> bool {
x < self.barrier && (x as usize) < self.remap.len() && self.remap[x as usize] == x
@ -80,7 +80,7 @@ impl<'a, O: Operator, C: Collection<O>> RemappedCollection<'a, O, C> {
}
}
impl<O: Operator, C: Collection<O>> Vectors<O> for RemappedCollection<'_, O, C> {
impl<V: VectorOwned, C: Vectors<V> + Collection> Vectors<V> for RemappedCollection<'_, V, C> {
fn dims(&self) -> u32 {
self.collection.dims()
}
@ -89,12 +89,12 @@ impl<O: Operator, C: Collection<O>> Vectors<O> for RemappedCollection<'_, O, C>
self.remap.len() as u32
}
fn vector(&self, i: u32) -> Borrowed<'_, O> {
fn vector(&self, i: u32) -> V::Borrowed<'_> {
self.collection.vector(self.remap[i as usize])
}
}
impl<O: Operator, C: Collection<O>> Collection<O> for RemappedCollection<'_, O, C> {
impl<V: VectorOwned, C: Collection> Collection for RemappedCollection<'_, V, C> {
fn payload(&self, i: u32) -> Payload {
self.collection.payload(self.remap[i as usize])
}

View File

@ -8,7 +8,7 @@ use base::vector::VectorOwned;
const SAMPLES: usize = 65536;
pub fn sample<O: Operator>(vectors: &impl Vectors<O>) -> Vec2<Scalar<O>> {
pub fn sample<O: Operator>(vectors: &impl Vectors<Owned<O>>) -> Vec2<Scalar<O>> {
let n = vectors.len();
let m = std::cmp::min(SAMPLES as u32, n);
let f = super::rand::sample_u32(&mut rand::thread_rng(), n, m);
@ -20,7 +20,7 @@ pub fn sample<O: Operator>(vectors: &impl Vectors<O>) -> Vec2<Scalar<O>> {
samples
}
pub fn sample_cast<O: Operator>(vectors: &impl Vectors<O>) -> Vec2<F32> {
pub fn sample_cast<O: Operator>(vectors: &impl Vectors<Owned<O>>) -> Vec2<F32> {
let n = vectors.len();
let m = std::cmp::min(SAMPLES as u32, n);
let f = super::rand::sample_u32(&mut rand::thread_rng(), n, m);
@ -38,7 +38,7 @@ pub fn sample_cast<O: Operator>(vectors: &impl Vectors<O>) -> Vec2<F32> {
}
pub fn sample_subvector_transform<O: Operator>(
vectors: &impl Vectors<O>,
vectors: &impl Vectors<Owned<O>>,
s: usize,
e: usize,
transform: impl Fn(Borrowed<'_, O>) -> Owned<O>,

View File

@ -25,7 +25,11 @@ pub struct Flat<O: OperatorFlat> {
}
impl<O: OperatorFlat> Flat<O> {
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
pub fn create(
path: impl AsRef<Path>,
options: IndexOptions,
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
) -> Self {
let remapped = RemappedCollection::from_source(source);
from_nothing(path, options, &remapped)
}
@ -62,6 +66,10 @@ impl<O: OperatorFlat> Flat<O> {
}))
}
pub fn dims(&self) -> u32 {
self.storage.dims()
}
pub fn len(&self) -> u32 {
self.storage.len()
}
@ -78,12 +86,12 @@ impl<O: OperatorFlat> Flat<O> {
fn from_nothing<O: OperatorFlat>(
path: impl AsRef<Path>,
options: IndexOptions,
collection: &impl Collection<O>,
collection: &(impl Vectors<Owned<O>> + Collection + Sync),
) -> Flat<O> {
create_dir(path.as_ref()).unwrap();
let flat_indexing_options = options.indexing.clone().unwrap_flat();
let storage = O::Storage::create(path.as_ref().join("storage"), collection);
let quantization = Quantization::create(
let quantization = Quantization::<O>::create(
path.as_ref().join("quantization"),
options.vector,
flat_indexing_options.quantization,

View File

@ -46,7 +46,7 @@ impl<O: OperatorHnsw> Hnsw<O> {
pub fn create(
path: impl AsRef<Path>,
options: IndexOptions,
source: &(impl Source<O> + Sync),
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
) -> Self {
let remapped = RemappedCollection::from_source(source);
if let Some(main) = source.get_main::<Self>() {
@ -97,6 +97,10 @@ impl<O: OperatorHnsw> Hnsw<O> {
)
}
pub fn dims(&self) -> u32 {
self.storage.dims()
}
pub fn len(&self) -> u32 {
self.storage.len()
}
@ -113,7 +117,7 @@ impl<O: OperatorHnsw> Hnsw<O> {
fn from_nothing<O: OperatorHnsw>(
path: impl AsRef<Path>,
options: IndexOptions,
collection: &(impl Collection<O> + Sync),
collection: &(impl Vectors<Owned<O>> + Collection + Sync),
) -> Hnsw<O> {
create_dir(path.as_ref()).unwrap();
let HnswIndexingOptions {
@ -134,7 +138,7 @@ fn from_nothing<O: OperatorHnsw>(
finish(&mut g, m);
let storage = O::Storage::create(path.as_ref().join("storage"), collection);
rayon::check();
let quantization = Quantization::create(
let quantization = Quantization::<O>::create(
path.as_ref().join("quantization"),
options.vector,
quantization_options,
@ -195,7 +199,7 @@ fn from_nothing<O: OperatorHnsw>(
fn from_main<O: OperatorHnsw>(
path: impl AsRef<Path>,
options: IndexOptions,
remapped: &RemappedCollection<O, impl Collection<O> + Sync>,
remapped: &RemappedCollection<Owned<O>, impl Vectors<Owned<O>> + Collection + Sync>,
main: &Hnsw<O>,
) -> Hnsw<O> {
create_dir(path.as_ref()).unwrap();
@ -232,7 +236,7 @@ fn from_main<O: OperatorHnsw>(
finish(&mut g, m);
let storage = O::Storage::create(path.as_ref().join("storage"), remapped);
rayon::check();
let quantization = Quantization::create(
let quantization = Quantization::<O>::create(
path.as_ref().join("quantization"),
options.vector,
quantization_options,

View File

@ -9,7 +9,7 @@ bincode.workspace = true
byteorder.workspace = true
crc32fast = "1.4.0"
crossbeam = "0.8.4"
dashmap = "5.5.3"
dashmap = "6.0.1"
log.workspace = true
parking_lot.workspace = true
rand.workspace = true
@ -20,17 +20,8 @@ validator.workspace = true
base = { path = "../base" }
common = { path = "../common" }
k_means = { path = "../k_means" }
quantization = { path = "../quantization" }
indexing = { path = "../indexing" }
stoppable_rayon = { path = "../stoppable_rayon" }
storage = { path = "../storage" }
# algorithms
flat = { path = "../flat" }
hnsw = { path = "../hnsw" }
inverted = { path = "../inverted" }
ivf = { path = "../ivf" }
rabitq = { path = "../rabitq" }
[lints]
workspace = true

View File

@ -1,52 +0,0 @@
use crate::Op;
use base::index::*;
use base::operator::*;
use base::search::*;
use std::convert::Infallible;
use thiserror::Error;

/// Error returned by [`GrowingIndexing::insert`]: this index never accepts writes.
#[derive(Debug, Error)]
#[error("`GrowingIndexing` is read-only.")]
pub struct GrowingIndexingInsertError;

/// Placeholder indexing for growing segments.
///
/// The single variant carries an [`Infallible`], which makes the type
/// uninhabited: no value of `GrowingIndexing` can ever be constructed, so
/// every method body below is unreachable and stubbed with `unimplemented!()`.
/// The `fn(O) -> O` component only ties the `O` parameter into the type
/// (variance/ownership marker); it is never called.
pub enum GrowingIndexing<O: Op> {
    Infallible(Infallible, fn(O) -> O),
}

impl<O: Op> GrowingIndexing<O> {
    /// Stub constructor; unreachable because the type is uninhabited.
    pub fn new(_: VectorOptions, _: usize) -> Self {
        unimplemented!()
    }

    /// Stub capacity check; unreachable.
    pub fn is_full(&self) -> bool {
        unimplemented!()
    }

    /// Stub seal operation; unreachable.
    pub fn seal(&self) {
        unimplemented!()
    }

    /// Always read-only by contract; unreachable in practice.
    pub fn insert(&self, _: O::VectorOwned, _: Payload) -> Result<(), GrowingIndexingInsertError> {
        unimplemented!()
    }

    /// Stub search iterator; unreachable.
    pub fn vbase<'a>(
        &'a self,
        _: Borrowed<'a, O>,
        _: &'a SearchOptions,
    ) -> Box<dyn Iterator<Item = Element> + 'a> {
        unimplemented!()
    }

    /// Stub length accessor; unreachable.
    pub fn len(&self) -> u32 {
        unimplemented!()
    }

    /// Stub vector accessor; unreachable.
    pub fn vector(&self, _i: u32) -> Borrowed<'_, O> {
        unimplemented!()
    }

    /// Stub payload accessor; unreachable.
    pub fn payload(&self, _i: u32) -> Payload {
        unimplemented!()
    }
}

View File

@ -1,2 +0,0 @@
pub mod growing;
pub mod sealed;

View File

@ -1,7 +1,6 @@
#![allow(clippy::len_without_is_empty)]
pub mod delete;
pub mod indexing;
pub mod optimizing;
pub mod segment;
@ -25,11 +24,8 @@ use common::dir_ops::sync_walk_from_dir;
use common::file_atomic::FileAtomic;
use crossbeam::atomic::AtomicCell;
use crossbeam::channel::Sender;
use inverted::operator::OperatorInvertedIndex;
use ivf::operator::OperatorIvf;
use indexing::OperatorIndexing;
use parking_lot::Mutex;
use quantization::operator::OperatorQuantization;
use rabitq::operator::OperatorRabitq;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::collections::HashSet;
@ -39,30 +35,12 @@ use std::path::PathBuf;
use std::sync::Arc;
use std::thread::JoinHandle;
use std::time::Instant;
use storage::OperatorStorage;
use thiserror::Error;
use validator::Validate;
pub trait Op:
Operator
+ OperatorQuantization
+ OperatorStorage
+ OperatorIvf
+ OperatorInvertedIndex
+ OperatorRabitq
{
}
pub trait Op: OperatorIndexing {}
impl<
T: Operator
+ OperatorQuantization
+ OperatorStorage
+ OperatorIvf
+ OperatorInvertedIndex
+ OperatorRabitq,
> Op for T
{
}
impl<T: OperatorIndexing> Op for T {}
#[derive(Debug, Error)]
#[error("The index view is outdated.")]
@ -338,7 +316,7 @@ impl<O: Op> Index<O> {
}
pub fn create_sealed_segment(
&self,
source: &(impl Source<O> + Sync),
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
sealed_segment_ids: &[NonZeroU128],
growing_segment_ids: &[NonZeroU128],
) -> Option<Arc<SealedSegment<O>>> {

View File

@ -2,20 +2,22 @@ use crate::delete::Delete;
use crate::Op;
use crate::{GrowingSegment, SealedSegment};
use base::index::IndexOptions;
use base::operator::Borrowed;
use base::operator::{Borrowed, Owned};
use base::search::*;
use std::any::Any;
use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::Arc;
pub struct IndexSource<O: Op> {
pub struct IndexSource<V, O: Op> {
pub(super) sealed: Option<Arc<SealedSegment<O>>>,
pub(super) growing: Vec<Arc<GrowingSegment<O>>>,
pub(super) dims: u32,
pub(super) delete: Arc<Delete>,
_phantom: PhantomData<fn(V) -> V>,
}
impl<O: Op> IndexSource<O> {
impl<O: Op> IndexSource<Owned<O>, O> {
pub fn new(
options: IndexOptions,
sealed: Option<Arc<SealedSegment<O>>>,
@ -27,11 +29,12 @@ impl<O: Op> IndexSource<O> {
growing,
dims: options.vector.dims,
delete,
_phantom: PhantomData,
}
}
}
impl<O: Op> Vectors<O> for IndexSource<O> {
impl<O: Op> Vectors<Owned<O>> for IndexSource<Owned<O>, O> {
fn dims(&self) -> u32 {
self.dims
}
@ -58,7 +61,7 @@ impl<O: Op> Vectors<O> for IndexSource<O> {
}
}
impl<O: Op> Collection<O> for IndexSource<O> {
impl<O: Op> Collection for IndexSource<Owned<O>, O> {
fn payload(&self, mut index: u32) -> Payload {
for x in self.sealed.iter() {
if index < x.len() {
@ -76,7 +79,7 @@ impl<O: Op> Collection<O> for IndexSource<O> {
}
}
impl<O: Op> Source<O> for IndexSource<O> {
impl<O: Op> Source for IndexSource<Owned<O>, O> {
fn get_main<T: Any>(&self) -> Option<&T> {
let x = self.sealed.as_ref()?;
Some(
@ -95,12 +98,13 @@ impl<O: Op> Source<O> for IndexSource<O> {
}
}
pub struct RoGrowingCollection<O: Op> {
pub struct RoGrowingCollection<V, O: Op> {
pub(super) growing: Vec<Arc<GrowingSegment<O>>>,
pub(super) dims: u32,
_phantom: PhantomData<fn(V) -> V>,
}
impl<O: Op> Debug for RoGrowingCollection<O> {
impl<O: Op> Debug for RoGrowingCollection<Owned<O>, O> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("RoGrowingCollection")
.field("growing", &self.growing)
@ -109,7 +113,7 @@ impl<O: Op> Debug for RoGrowingCollection<O> {
}
}
impl<O: Op> Vectors<O> for RoGrowingCollection<O> {
impl<O: Op> Vectors<Owned<O>> for RoGrowingCollection<Owned<O>, O> {
fn dims(&self) -> u32 {
self.dims
}
@ -129,7 +133,7 @@ impl<O: Op> Vectors<O> for RoGrowingCollection<O> {
}
}
impl<O: Op> Collection<O> for RoGrowingCollection<O> {
impl<O: Op> Collection for RoGrowingCollection<Owned<O>, O> {
fn payload(&self, mut index: u32) -> Payload {
for x in self.growing.iter() {
if index < x.len() {

View File

@ -1,13 +1,14 @@
use crate::optimizing::index_source::IndexSource;
use crate::Index;
use crate::Op;
use base::operator::Owned;
use std::sync::Arc;
pub fn scan<O: Op>(
index: Arc<Index<O>>,
capacity: u32,
delete_threshold: f64,
) -> Option<IndexSource<O>> {
) -> Option<IndexSource<Owned<O>, O>> {
let (sealed, growing) = 'a: {
let protect = index.protect.lock();
// approach 1: merge small segments to a big segment
@ -86,7 +87,7 @@ pub fn scan<O: Op>(
))
}
pub fn make<O: Op>(index: Arc<Index<O>>, source: IndexSource<O>) {
pub fn make<O: Op>(index: Arc<Index<O>>, source: IndexSource<Owned<O>, O>) {
let _ = index.create_sealed_segment(
&source,
&source.sealed.iter().map(|x| x.id()).collect::<Vec<_>>(),

View File

@ -1,4 +1,3 @@
use crate::indexing::sealed::SealedIndexing;
use crate::utils::dir_ops::dir_size;
use crate::IndexTracker;
use crate::Op;
@ -6,6 +5,7 @@ use base::index::*;
use base::operator::*;
use base::search::*;
use crossbeam::atomic::AtomicCell;
use indexing::SealedIndexing;
use std::any::Any;
use std::fmt::Debug;
use std::num::NonZeroU128;
@ -37,7 +37,7 @@ impl<O: Op> SealedSegment<O> {
path: PathBuf,
id: NonZeroU128,
options: IndexOptions,
source: &(impl Source<O> + Sync),
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
) -> Arc<Self> {
let indexing = SealedIndexing::create(&path, options, source);
Arc::new(Self {

View File

@ -0,0 +1,19 @@
[package]
name = "indexing"
version.workspace = true
edition.workspace = true
[dependencies]
thiserror.workspace = true
base = { path = "../base" }
# algorithms
flat = { path = "../flat" }
hnsw = { path = "../hnsw" }
inverted = { path = "../inverted" }
ivf = { path = "../ivf" }
rabitq = { path = "../rabitq" }
[lints]
workspace = true

View File

@ -0,0 +1,15 @@
//! Crate root of `indexing`: re-exports the sealed-index dispatcher and
//! defines the umbrella operator trait it requires.

pub mod sealed;

pub use sealed::SealedIndexing;

use base::operator::Operator;
use inverted::operator::OperatorInvertedIndex;
use ivf::operator::OperatorIvf;
use rabitq::operator::OperatorRabitq;

/// Umbrella trait: the full set of operator capabilities a sealed index may
/// need (IVF, inverted index, RaBitQ, plus the base `Operator` contract).
///
/// Blanket-implemented below for every type that satisfies all component
/// traits, so downstream code only ever names this single bound.
pub trait OperatorIndexing:
    Operator + OperatorIvf + OperatorInvertedIndex + OperatorRabitq
{
}

impl<T: Operator + OperatorIvf + OperatorInvertedIndex + OperatorRabitq> OperatorIndexing for T {}

View File

@ -1,4 +1,4 @@
use crate::Op;
use crate::OperatorIndexing;
use base::index::*;
use base::operator::*;
use base::search::*;
@ -9,7 +9,7 @@ use ivf::Ivf;
use rabitq::Rabitq;
use std::path::Path;
pub enum SealedIndexing<O: Op> {
pub enum SealedIndexing<O: OperatorIndexing> {
Flat(Flat<O>),
Ivf(Ivf<O>),
Hnsw(Hnsw<O>),
@ -17,11 +17,11 @@ pub enum SealedIndexing<O: Op> {
Rabitq(Rabitq<O>),
}
impl<O: Op> SealedIndexing<O> {
impl<O: OperatorIndexing> SealedIndexing<O> {
pub fn create(
path: impl AsRef<Path>,
options: IndexOptions,
source: &(impl Source<O> + Sync),
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
) -> Self {
match options.indexing {
IndexingOptions::Flat(_) => Self::Flat(Flat::create(path, options, source)),
@ -57,8 +57,20 @@ impl<O: Op> SealedIndexing<O> {
SealedIndexing::Rabitq(x) => x.vbase(vector, opts),
}
}
}
pub fn len(&self) -> u32 {
impl<O: OperatorIndexing> Vectors<Owned<O>> for SealedIndexing<O> {
fn dims(&self) -> u32 {
match self {
SealedIndexing::Flat(x) => x.dims(),
SealedIndexing::Ivf(x) => x.dims(),
SealedIndexing::Hnsw(x) => x.dims(),
SealedIndexing::InvertedIndex(x) => x.dims(),
SealedIndexing::Rabitq(x) => x.dims(),
}
}
fn len(&self) -> u32 {
match self {
SealedIndexing::Flat(x) => x.len(),
SealedIndexing::Ivf(x) => x.len(),
@ -68,7 +80,7 @@ impl<O: Op> SealedIndexing<O> {
}
}
pub fn vector(&self, i: u32) -> Borrowed<'_, O> {
fn vector(&self, i: u32) -> Borrowed<'_, O> {
match self {
SealedIndexing::Flat(x) => x.vector(i),
SealedIndexing::Ivf(x) => x.vector(i),
@ -77,8 +89,10 @@ impl<O: Op> SealedIndexing<O> {
SealedIndexing::Rabitq(x) => x.vector(i),
}
}
}
pub fn payload(&self, i: u32) -> Payload {
impl<O: OperatorIndexing> Collection for SealedIndexing<O> {
fn payload(&self, i: u32) -> Payload {
match self {
SealedIndexing::Flat(x) => x.payload(i),
SealedIndexing::Ivf(x) => x.payload(i),

View File

@ -5,7 +5,7 @@ pub mod operator;
use self::operator::OperatorInvertedIndex;
use base::always_equal::AlwaysEqual;
use base::index::{IndexOptions, SearchOptions};
use base::operator::Borrowed;
use base::operator::{Borrowed, Owned};
use base::scalar::{ScalarLike, F32};
use base::search::{Collection, Element, Payload, Source, Vectors};
use common::json::Json;
@ -29,7 +29,11 @@ pub struct InvertedIndex<O: OperatorInvertedIndex> {
}
impl<O: OperatorInvertedIndex> InvertedIndex<O> {
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
pub fn create(
path: impl AsRef<Path>,
options: IndexOptions,
source: &(impl Vectors<Owned<O>> + Collection + Source),
) -> Self {
let remapped = RemappedCollection::from_source(source);
from_nothing(path, options, &remapped)
}
@ -65,6 +69,10 @@ impl<O: OperatorInvertedIndex> InvertedIndex<O> {
}))
}
pub fn dims(&self) -> u32 {
self.storage.dims()
}
pub fn len(&self) -> u32 {
self.storage.len()
}
@ -81,7 +89,7 @@ impl<O: OperatorInvertedIndex> InvertedIndex<O> {
fn from_nothing<O: OperatorInvertedIndex>(
path: impl AsRef<Path>,
opts: IndexOptions,
collection: &impl Collection<O>,
collection: &(impl Vectors<Owned<O>> + Collection),
) -> InvertedIndex<O> {
create_dir(path.as_ref()).expect("failed to create path for inverted index");

View File

@ -26,7 +26,11 @@ pub struct IvfNaive<O: Op> {
}
impl<O: Op> IvfNaive<O> {
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
pub fn create(
path: impl AsRef<Path>,
options: IndexOptions,
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
) -> Self {
let remapped = RemappedCollection::from_source(source);
from_nothing(path, options, &remapped)
}
@ -35,6 +39,10 @@ impl<O: Op> IvfNaive<O> {
open(path)
}
pub fn dims(&self) -> u32 {
self.storage.dims()
}
pub fn len(&self) -> u32 {
self.storage.len()
}
@ -87,7 +95,7 @@ impl<O: Op> IvfNaive<O> {
fn from_nothing<O: Op>(
path: impl AsRef<Path>,
options: IndexOptions,
collection: &impl Collection<O>,
collection: &(impl Vectors<Owned<O>> + Collection + Sync),
) -> IvfNaive<O> {
create_dir(path.as_ref()).unwrap();
let IvfIndexingOptions {
@ -96,7 +104,7 @@ fn from_nothing<O: Op>(
residual_quantization: _,
quantization: quantization_options,
} = options.indexing.clone().unwrap_ivf();
let samples = common::sample::sample(collection);
let samples = common::sample::sample::<O>(collection);
rayon::check();
let centroids = k_means(nlist as usize, samples, spherical_centroids);
rayon::check();
@ -115,7 +123,7 @@ fn from_nothing<O: Op>(
let collection = RemappedCollection::from_collection(collection, remap);
rayon::check();
let storage = O::Storage::create(path.as_ref().join("storage"), &collection);
let quantization = Quantization::create(
let quantization = Quantization::<O>::create(
path.as_ref().join("quantization"),
options.vector,
quantization_options,

View File

@ -26,7 +26,11 @@ pub struct IvfResidual<O: Op> {
}
impl<O: Op> IvfResidual<O> {
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
pub fn create(
path: impl AsRef<Path>,
options: IndexOptions,
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
) -> Self {
let remapped = RemappedCollection::from_source(source);
from_nothing(path, options, &remapped)
}
@ -35,6 +39,10 @@ impl<O: Op> IvfResidual<O> {
open(path)
}
pub fn dims(&self) -> u32 {
self.storage.dims()
}
pub fn len(&self) -> u32 {
self.storage.len()
}
@ -89,7 +97,7 @@ impl<O: Op> IvfResidual<O> {
fn from_nothing<O: Op>(
path: impl AsRef<Path>,
options: IndexOptions,
collection: &impl Collection<O>,
collection: &(impl Vectors<Owned<O>> + Collection + Sync),
) -> IvfResidual<O> {
create_dir(path.as_ref()).unwrap();
let IvfIndexingOptions {
@ -98,7 +106,7 @@ fn from_nothing<O: Op>(
residual_quantization: _,
quantization: quantization_options,
} = options.indexing.clone().unwrap_ivf();
let samples = common::sample::sample(collection);
let samples = common::sample::sample::<O>(collection);
rayon::check();
let centroids = k_means(nlist as usize, samples, spherical_centroids);
rayon::check();
@ -117,7 +125,7 @@ fn from_nothing<O: Op>(
let collection = RemappedCollection::from_collection(collection, remap);
rayon::check();
let storage = O::Storage::create(path.as_ref().join("storage"), &collection);
let quantization = Quantization::create(
let quantization = Quantization::<O>::create(
path.as_ref().join("quantization"),
options.vector,
quantization_options,

View File

@ -20,7 +20,11 @@ pub enum Ivf<O: OperatorIvf> {
}
impl<O: OperatorIvf> Ivf<O> {
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
pub fn create(
path: impl AsRef<Path>,
options: IndexOptions,
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
) -> Self {
let IvfIndexingOptions {
quantization: quantization_options,
residual_quantization,
@ -54,6 +58,13 @@ impl<O: OperatorIvf> Ivf<O> {
}
}
pub fn dims(&self) -> u32 {
match self {
Ivf::Naive(x) => x.dims(),
Ivf::Residual(x) => x.dims(),
}
}
pub fn len(&self) -> u32 {
match self {
Ivf::Naive(x) => x.len(),

View File

@ -0,0 +1,30 @@
[package]
name = "pyvectors"
version.workspace = true
edition.workspace = true
[lib]
name = "pyvectors"
crate-type = ["cdylib"]
[dependencies]
num-traits.workspace = true
serde.workspace = true
serde_json.workspace = true
validator.workspace = true
# python
ndarray = { version = "0.15" }
numpy = { version = "0.21" }
pyo3 = { version = "0.21", features = ["extension-module"] }
pythonize = "0.21"
base = { path = "../base" }
detect = { path = "../detect" }
indexing = { path = "../indexing" }
stoppable_rayon = { path = "../stoppable_rayon" }
[lints]
rust.unsafe_op_in_unsafe_fn = "allow"
rust.unused_lifetimes = "warn"
rust.unused_qualifications = "warn"

View File

@ -0,0 +1,9 @@
[build-system]
requires = ["maturin>=1,<2"]
build-backend = "maturin"
[tool.maturin]
module-name = "vectors"
[project]
name = "vectors"

View File

@ -0,0 +1,41 @@
use base::scalar::F32;
use base::search::Vectors;
use base::vector::{Vecf32Borrowed, Vecf32Owned};
use ndarray::{s, ArrayView2};

/// Zero-copy adapter exposing a 2-D `f32` array (one vector per row) as a
/// [`Vectors`] collection of `Vecf32Owned`.
pub struct Dataset<'a> {
    underlying: ArrayView2<'a, f32>,
}

impl<'a> Dataset<'a> {
    /// Wraps `dataset`, validating the shape against index limits.
    ///
    /// # Panics
    /// Panics if the vector dimension (columns) is 0 or exceeds 65535, or if
    /// either axis does not fit in `u32` (row/column counts are exposed as
    /// `u32` by the `Vectors` trait).
    pub fn new(dataset: ArrayView2<'a, f32>) -> Self {
        assert!(1 <= dataset.dim().1 && dataset.dim().1 <= 65535);
        assert!(dataset.dim().1 <= u32::MAX as usize);
        assert!(dataset.dim().0 <= u32::MAX as usize);
        Self {
            underlying: dataset,
        }
    }
}

impl<'a> Vectors<Vecf32Owned> for Dataset<'a> {
    // Number of columns = vector dimension.
    fn dims(&self) -> u32 {
        self.underlying.dim().1 as _
    }

    // Number of rows = number of vectors.
    fn len(&self) -> u32 {
        self.underlying.dim().0 as _
    }

    /// Borrows row `i` as a vector.
    ///
    /// # Panics
    /// Panics if the row is not contiguous in memory (e.g. a transposed or
    /// strided view), since it must be reinterpreted as a plain slice.
    fn vector(&self, i: u32) -> Vecf32Borrowed<'_> {
        let s = self
            .underlying
            .slice(s!(i as usize, ..))
            .to_slice()
            .expect("memory is non continuous");
        // Reinterpret &[f32] as &[F32]. NOTE(review): soundness relies on
        // `F32` being a transparent newtype over `f32` — confirm in
        // base::scalar that it is #[repr(transparent)].
        fn cast(x: &[f32]) -> &[F32] {
            unsafe { std::mem::transmute(x) }
        }
        Vecf32Borrowed::new(cast(s))
    }
}

View File

@ -0,0 +1,122 @@
use base::distance::DistanceKind;
use base::index::{IndexOptions, SearchOptions};
use base::operator::*;
use base::scalar::F32;
use base::search::{Collection, Element, Pointer, Source, Vectors};
use base::vector::*;
use std::path::Path;

/// Runtime-dispatched wrapper over [`indexing::SealedIndexing`], one variant
/// per supported (vector kind, distance) operator combination.
///
/// Only the `Vecf32` × {`L2`, `Dot`} variants are currently constructible via
/// `create`/`open`; the remaining variants are declared ahead of time (hence
/// `allow(dead_code)`).
#[allow(dead_code)]
pub enum Indexing {
    Vecf32Dot(indexing::SealedIndexing<Vecf32Dot>),
    Vecf32L2(indexing::SealedIndexing<Vecf32L2>),
    Vecf16Dot(indexing::SealedIndexing<Vecf16Dot>),
    Vecf16L2(indexing::SealedIndexing<Vecf16L2>),
    BVectorDot(indexing::SealedIndexing<BVectorDot>),
    BVectorHamming(indexing::SealedIndexing<BVectorHamming>),
    BVectorJaccard(indexing::SealedIndexing<BVectorJaccard>),
    SVecf32Dot(indexing::SealedIndexing<SVecf32Dot>),
    SVecf32L2(indexing::SealedIndexing<SVecf32L2>),
}

impl Indexing {
    /// Builds a new index at `path` from `source`.
    ///
    /// Construction runs inside a scoped `stoppable_rayon` thread pool, and
    /// the resolved `IndexOptions` are persisted to a `.index_options` JSON
    /// file next to the index so that [`Indexing::open`] can later recover
    /// the operator type.
    ///
    /// # Panics
    /// Panics with `unimplemented!` for any combination other than
    /// `Vecf32` × {`L2`, `Dot`}; also panics if pool setup or the write of
    /// `.index_options` fails.
    pub fn create(
        path: impl AsRef<Path>,
        index_options: IndexOptions,
        source: impl Vectors<Vecf32Owned> + Collection + Source + Sync,
    ) -> Self {
        let path = path.as_ref();
        match (index_options.vector.v, index_options.vector.d) {
            (VectorKind::Vecf32, DistanceKind::L2) => Self::Vecf32L2(
                stoppable_rayon::ThreadPoolBuilder::new()
                    .build_scoped(|pool| {
                        pool.install(|| {
                            let x = indexing::SealedIndexing::create(
                                &path,
                                index_options.clone(),
                                &source,
                            );
                            // write options
                            std::fs::write(
                                path.join(".index_options"),
                                serde_json::to_string(&index_options).unwrap(),
                            )
                            .unwrap();
                            x
                        })
                    })
                    // First unwrap: pool build error; second: stopped/panicked scope.
                    .unwrap()
                    .unwrap(),
            ),
            (VectorKind::Vecf32, DistanceKind::Dot) => Self::Vecf32Dot(
                stoppable_rayon::ThreadPoolBuilder::new()
                    .build_scoped(|pool| {
                        pool.install(|| {
                            let x = indexing::SealedIndexing::create(
                                &path,
                                index_options.clone(),
                                &source,
                            );
                            // write options
                            std::fs::write(
                                path.join(".index_options"),
                                serde_json::to_string(&index_options).unwrap(),
                            )
                            .unwrap();
                            x
                        })
                    })
                    .unwrap()
                    .unwrap(),
            ),
            _ => unimplemented!(),
        }
    }

    /// Opens an existing index at `path`.
    ///
    /// Reads the `.index_options` file written by [`Indexing::create`] to
    /// decide which operator variant to instantiate.
    ///
    /// # Panics
    /// Panics if the options file is missing/unreadable/invalid JSON, or if
    /// it names an unsupported (vector kind, distance) combination.
    pub fn open(path: impl AsRef<Path>) -> Self {
        let path = path.as_ref();
        // read options
        let index_options: IndexOptions =
            serde_json::from_slice(&std::fs::read(path.join(".index_options")).unwrap()).unwrap();
        match (index_options.vector.v, index_options.vector.d) {
            (VectorKind::Vecf32, DistanceKind::L2) => {
                Self::Vecf32L2(indexing::SealedIndexing::open(path, index_options))
            }
            (VectorKind::Vecf32, DistanceKind::Dot) => {
                Self::Vecf32Dot(indexing::SealedIndexing::open(path, index_options))
            }
            _ => unimplemented!(),
        }
    }

    /// Streams search results for `vector`, nearest first, as
    /// `(distance, pointer)` pairs.
    ///
    /// # Panics
    /// Panics if the query's vector kind does not match the index variant.
    pub fn vbase<'a>(
        &'a self,
        vector: BorrowedVector<'a>,
        opts: &'a SearchOptions,
    ) -> impl Iterator<Item = (F32, Pointer)> + 'a {
        match (self, vector) {
            (Self::Vecf32L2(x), BorrowedVector::Vecf32(vector)) => x.vbase(vector, opts),
            (Self::Vecf32Dot(x), BorrowedVector::Vecf32(vector)) => x.vbase(vector, opts),
            (Self::Vecf16Dot(x), BorrowedVector::Vecf16(vector)) => x.vbase(vector, opts),
            (Self::Vecf16L2(x), BorrowedVector::Vecf16(vector)) => x.vbase(vector, opts),
            (Self::BVectorDot(x), BorrowedVector::BVector(vector)) => x.vbase(vector, opts),
            (Self::BVectorHamming(x), BorrowedVector::BVector(vector)) => x.vbase(vector, opts),
            (Self::BVectorJaccard(x), BorrowedVector::BVector(vector)) => x.vbase(vector, opts),
            (Self::SVecf32Dot(x), BorrowedVector::SVecf32(vector)) => x.vbase(vector, opts),
            (Self::SVecf32L2(x), BorrowedVector::SVecf32(vector)) => x.vbase(vector, opts),
            _ => panic!("invalid vector type"),
        }
        // Unpack each Element into the (distance, pointer) pair exposed to callers.
        .map(|Element { distance, payload }| (distance, payload.0.pointer()))
    }

    /// Vector dimension of the underlying index, regardless of variant.
    pub fn dims(&self) -> u32 {
        match self {
            Indexing::Vecf32Dot(x) => x.dims(),
            Indexing::Vecf32L2(x) => x.dims(),
            Indexing::Vecf16Dot(x) => x.dims(),
            Indexing::Vecf16L2(x) => x.dims(),
            Indexing::BVectorDot(x) => x.dims(),
            Indexing::BVectorHamming(x) => x.dims(),
            Indexing::BVectorJaccard(x) => x.dims(),
            Indexing::SVecf32Dot(x) => x.dims(),
            Indexing::SVecf32L2(x) => x.dims(),
        }
    }
}

117
crates/pyvectors/src/lib.rs Normal file
View File

@ -0,0 +1,117 @@
//! PyO3 extension module `vectors`: exposes index build/open/search to Python.

mod dataset;
mod indexing;
mod with_labels;

use base::distance::DistanceKind;
use base::index::*;
use base::search::Vectors;
use base::vector::{BorrowedVector, VectorKind};
use dataset::Dataset;
use ndarray::{Array1, Array2};
use numpy::{IntoPyArray, PyArray2, PyReadonlyArray1, PyReadonlyArray2};
use pyo3::prelude::*;
use pyo3::types::PyDict;
use std::path::PathBuf;
use validator::Validate;
use with_labels::WithLabels;

/// Module init: runs CPU-feature detection once, then registers the
/// `Indexing` class.
#[pymodule]
fn vectors(m: &Bound<'_, PyModule>) -> PyResult<()> {
    detect::init();
    m.add_class::<Indexing>()?;
    Ok(())
}

/// Python-visible handle wrapping the Rust-side [`indexing::Indexing`].
#[pyclass]
pub struct Indexing(indexing::Indexing);

#[pymethods]
impl Indexing {
    /// Builds an index at `path` from an `f32` dataset and parallel `i64`
    /// labels; extra keyword arguments are deserialized into
    /// `IndexingOptions`.
    ///
    /// # Panics
    /// Panics if `path` already exists, `dims` is outside 1..=65535, the
    /// dataset width does not equal `dims`, the labels array is
    /// non-contiguous, the distance string is unsupported, or any options
    /// fail validation/deserialization.
    #[staticmethod]
    #[pyo3(signature = (path, distance, dims, dataset, labels, **indexing_options))]
    pub fn create(
        path: &str,
        distance: &str,
        dims: u32,
        dataset: PyReadonlyArray2<'_, f32>,
        labels: PyReadonlyArray1<'_, i64>,
        indexing_options: Option<Bound<'_, PyDict>>,
    ) -> Self {
        // path: refuse to clobber an existing file/directory.
        let path = PathBuf::from(path);
        assert_eq!(std::fs::exists(&path).ok(), Some(false), "file exists");
        // distance, dims
        assert!(matches!(dims, 1..=65535));
        let vector_options = VectorOptions {
            dims,
            // Only dense f32 vectors are accepted from Python for now.
            v: VectorKind::Vecf32,
            d: match distance {
                "dot" => DistanceKind::Dot,
                "l2" => DistanceKind::L2,
                "hamming" => DistanceKind::Hamming,
                "jaccard" => DistanceKind::Jaccard,
                _ => unimplemented!("distance type {distance} is not implemented"),
            },
        };
        vector_options.validate().expect("not valid vector options");
        // dataset: zero-copy view over the NumPy array, paired with labels.
        let dataset = dataset.as_array();
        assert!(dataset.dim().1 == dims as usize, "bad dataset");
        let dataset = Dataset::new(dataset);
        let source = WithLabels::new(
            dataset,
            labels.as_slice().expect("memory is non continuous"),
        );
        // indexing_options: Python kwargs dict -> IndexingOptions (defaults if absent).
        let indexing_options: IndexingOptions = indexing_options
            .map(|obj| pythonize::depythonize_bound(obj.into_any()).expect("failed to deserialize"))
            .unwrap_or_default();
        let index_options = IndexOptions {
            vector: vector_options,
            indexing: indexing_options,
        };
        index_options.validate().expect("not valid index options");
        // build
        Self(indexing::Indexing::create(
            &path,
            index_options.clone(),
            source,
        ))
    }

    /// Opens an index previously created at `path`.
    #[staticmethod]
    pub fn open(path: &str) -> Self {
        Self(indexing::Indexing::open(path))
    }

    /// Batched k-NN search: one query per dataset row.
    ///
    /// Returns a `(distances, labels)` pair of `(n_queries, k)` arrays; rows
    /// with fewer than `k` hits are padded with `f32::INFINITY` / `i64::MAX`.
    ///
    /// # Panics
    /// Panics if the query width does not match the index dimension or if
    /// `search_options` kwargs fail to deserialize.
    #[pyo3(signature = (dataset, k, **search_options))]
    pub fn search<'py>(
        &self,
        py: Python<'py>,
        dataset: PyReadonlyArray2<'py, f32>,
        k: u32,
        search_options: Option<Bound<'py, PyDict>>,
    ) -> (Bound<'py, PyArray2<f32>>, Bound<'py, PyArray2<i64>>) {
        // dataset
        let dataset = dataset.as_array();
        assert!(dataset.dim().1 == self.0.dims() as usize, "bad dataset");
        let dataset = Dataset::new(dataset);
        // search_options
        let search_options: SearchOptions = search_options
            .map(|obj| pythonize::depythonize_bound(obj.into_any()).expect("failed to deserialize"))
            .unwrap_or_default();
        // results: start with 0 rows, k columns; append one row per query.
        let mut d = Array2::zeros((0, k as usize));
        let mut l = Array2::zeros((0, k as usize));
        for i in 0..dataset.len() {
            let (distances, labels) = self
                .0
                .vbase(BorrowedVector::Vecf32(dataset.vector(i)), &search_options)
                .map(|(distance, label)| (distance.0, label.as_u64() as i64))
                // Pad short result streams so every row is exactly k wide.
                .chain(std::iter::repeat((f32::INFINITY, i64::MAX)))
                .take(k as usize)
                .unzip::<_, _, Vec<_>, Vec<_>>();
            d.push_row(Array1::from_vec(distances).view()).unwrap();
            l.push_row(Array1::from_vec(labels).view()).unwrap();
        }
        (d.into_pyarray_bound(py), l.into_pyarray_bound(py))
    }
}

View File

@ -0,0 +1,48 @@
use base::search::{Collection, Payload, Pointer, Source, Vectors};
use base::vector::*;

/// Pairs a vector dataset with per-row `i64` labels, providing the
/// `Vectors` + `Collection` + `Source` trait surface an index build expects.
pub struct WithLabels<'a, T> {
    dataset: T,
    labels: &'a [i64],
}

impl<'a, T: Vectors<Vecf32Owned>> WithLabels<'a, T> {
    /// Zips `dataset` with `labels`.
    ///
    /// # Panics
    /// Panics if the label count differs from the number of vectors.
    pub fn new(dataset: T, labels: &'a [i64]) -> Self {
        assert!(dataset.len() as usize == labels.len());
        Self { dataset, labels }
    }
}

// Vector access is delegated wholesale to the wrapped dataset.
impl<'a, V: VectorOwned, T: Vectors<V>> Vectors<V> for WithLabels<'a, T> {
    fn dims(&self) -> u32 {
        self.dataset.dims()
    }
    fn len(&self) -> u32 {
        self.dataset.len()
    }
    fn vector(&self, i: u32) -> V::Borrowed<'_> {
        self.dataset.vector(i)
    }
}

impl<T> Collection for WithLabels<'_, T> {
    // The label becomes the payload pointer; i64 -> u64 is a bit-cast here,
    // reversed on the search side (`as_u64() as i64`).
    fn payload(&self, i: u32) -> Payload {
        Payload::new(Pointer::new(self.labels[i as usize] as u64), 0)
    }
}

impl<T> Source for WithLabels<'_, T> {
    // No pre-existing index to merge from.
    fn get_main<X: std::any::Any>(&self) -> Option<&X> {
        None
    }
    fn get_main_len(&self) -> u32 {
        0
    }
    // No deletions: every row participates in the build.
    fn check_existing(&self, _: u32) -> bool {
        true
    }
}

View File

@ -45,7 +45,7 @@ impl<O: OperatorQuantization> Quantizer<O> {
pub fn train(
vector_options: VectorOptions,
quantization_options: QuantizationOptions,
vectors: &impl Vectors<O>,
vectors: &(impl Vectors<Owned<O>> + Sync),
transform: impl Fn(Borrowed<'_, O>) -> Owned<O> + Copy + Send + Sync,
) -> Self {
use QuantizationOptions::*;
@ -91,7 +91,7 @@ impl<O: OperatorQuantization> Quantization<O> {
path: impl AsRef<Path>,
vector_options: VectorOptions,
quantization_options: QuantizationOptions,
vectors: &impl Vectors<O>,
vectors: &(impl Vectors<Owned<O>> + Sync),
transform: impl Fn(Borrowed<'_, O>) -> Owned<O> + Copy + Send + Sync,
) -> Self {
std::fs::create_dir(path.as_ref()).unwrap();
@ -240,7 +240,7 @@ impl<O: OperatorQuantization> Quantization<O> {
pub fn process(
&self,
vectors: &impl Vectors<O>,
vectors: &impl Vectors<Owned<O>>,
preprocessed: &QuantizationPreprocessed<O>,
u: u32,
) -> F32 {

View File

@ -33,7 +33,7 @@ impl<O: OperatorProductQuantization> ProductQuantizer<O> {
pub fn train(
vector_options: VectorOptions,
product_quantization_options: ProductQuantizationOptions,
vectors: &impl Vectors<O>,
vectors: &(impl Vectors<Owned<O>> + Sync),
transform: impl Fn(Borrowed<'_, O>) -> Owned<O> + Copy + Send + Sync,
) -> Self {
let dims = vector_options.dims;
@ -46,7 +46,7 @@ impl<O: OperatorProductQuantization> ProductQuantizer<O> {
let subdims = std::cmp::min(ratio, dims - ratio * p);
let start = (p * ratio) as usize;
let end = start + subdims as usize;
let subsamples = sample_subvector_transform(vectors, start, end, transform);
let subsamples = sample_subvector_transform::<O>(vectors, start, end, transform);
k_means(1 << bits, subsamples, false)
})
.collect::<Vec<_>>();

View File

@ -33,7 +33,7 @@ impl<O: OperatorScalarQuantization> ScalarQuantizer<O> {
pub fn train(
vector_options: VectorOptions,
scalar_quantization_options: ScalarQuantizationOptions,
vectors: &impl Vectors<O>,
vectors: &impl Vectors<Owned<O>>,
transform: impl Fn(Borrowed<'_, O>) -> Owned<O> + Copy,
) -> Self {
let dims = vector_options.dims;

View File

@ -26,7 +26,7 @@ impl<O: OperatorTrivialQuantization> TrivialQuantizer<O> {
pub fn train(
vector_options: VectorOptions,
_: TrivialQuantizationOptions,
_: &impl Vectors<O>,
_: &impl Vectors<Owned<O>>,
_: impl Fn(Borrowed<'_, O>) -> Owned<O> + Copy,
) -> Self {
Self {

View File

@ -11,7 +11,7 @@ use crate::operator::OperatorRabitq as Op;
use crate::quant::quantization::Quantization;
use base::always_equal::AlwaysEqual;
use base::index::{IndexOptions, RabitqIndexingOptions, SearchOptions};
use base::operator::Borrowed;
use base::operator::{Borrowed, Owned};
use base::scalar::F32;
use base::search::RerankerPop;
use base::search::{Collection, Element, Payload, Source, Vectors};
@ -35,7 +35,11 @@ pub struct Rabitq<O: Op> {
}
impl<O: Op> Rabitq<O> {
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
pub fn create(
path: impl AsRef<Path>,
options: IndexOptions,
source: &(impl Vectors<Owned<O>> + Collection + Source),
) -> Self {
let remapped = RemappedCollection::from_source(source);
from_nothing(path, options, &remapped)
}
@ -44,6 +48,10 @@ impl<O: Op> Rabitq<O> {
open(path)
}
pub fn dims(&self) -> u32 {
self.storage.dims()
}
pub fn len(&self) -> u32 {
self.storage.len()
}
@ -96,7 +104,7 @@ impl<O: Op> Rabitq<O> {
fn from_nothing<O: Op>(
path: impl AsRef<Path>,
options: IndexOptions,
collection: &impl Collection<O>,
collection: &(impl Vectors<Owned<O>> + Collection),
) -> Rabitq<O> {
create_dir(path.as_ref()).unwrap();
let RabitqIndexingOptions { nlist } = options.indexing.clone().unwrap_rabitq();
@ -119,7 +127,7 @@ fn from_nothing<O: Op>(
}
projection
};
let samples = common::sample::sample_cast(collection);
let samples = common::sample::sample_cast::<O>(collection);
rayon::check();
let centroids: Vec2<F32> = k_means(nlist as usize, samples, false);
rayon::check();

View File

@ -1,5 +1,4 @@
use crate::Storage;
use base::operator::Operator;
use base::search::*;
use base::vector::*;
use common::json::Json;
@ -12,7 +11,7 @@ pub struct BVectorStorage {
slice: MmapArray<u64>,
}
impl<O: Operator<VectorOwned = BVectorOwned>> Vectors<O> for BVectorStorage {
impl Vectors<BVectorOwned> for BVectorStorage {
fn dims(&self) -> u32 {
*self.dims
}
@ -29,8 +28,8 @@ impl<O: Operator<VectorOwned = BVectorOwned>> Vectors<O> for BVectorStorage {
}
}
impl<O: Operator<VectorOwned = BVectorOwned>> Storage<O> for BVectorStorage {
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<O>) -> Self {
impl Storage<BVectorOwned> for BVectorStorage {
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<BVectorOwned>) -> Self {
std::fs::create_dir(path.as_ref()).unwrap();
let dims = Json::create(path.as_ref().join("dims"), vectors.dims());
let len = Json::create(path.as_ref().join("len"), vectors.len());

View File

@ -5,15 +5,16 @@ mod vec;
use base::operator::*;
use base::scalar::*;
use base::search::*;
use base::vector::VectorOwned;
use std::path::Path;
pub trait Storage<O: Operator>: Vectors<O> {
pub trait Storage<V: VectorOwned>: Vectors<V> {
fn open(path: impl AsRef<Path>) -> Self;
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<O>) -> Self;
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<V>) -> Self;
}
pub trait OperatorStorage: Operator {
type Storage: Storage<Self> + Send + Sync;
type Storage: Storage<Owned<Self>> + Send + Sync;
}
impl OperatorStorage for SVecf32Dot {

View File

@ -1,5 +1,4 @@
use crate::Storage;
use base::operator::Operator;
use base::scalar::*;
use base::search::*;
use base::vector::*;
@ -15,7 +14,7 @@ pub struct SVecStorage {
offsets: MmapArray<usize>,
}
impl<O: Operator<VectorOwned = SVecf32Owned>> Vectors<O> for SVecStorage {
impl Vectors<SVecf32Owned> for SVecStorage {
fn dims(&self) -> u32 {
*self.dims
}
@ -33,8 +32,8 @@ impl<O: Operator<VectorOwned = SVecf32Owned>> Vectors<O> for SVecStorage {
}
}
impl<O: Operator<VectorOwned = SVecf32Owned>> Storage<O> for SVecStorage {
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<O>) -> Self {
impl Storage<SVecf32Owned> for SVecStorage {
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<SVecf32Owned>) -> Self {
std::fs::create_dir(path.as_ref()).unwrap();
let dims = Json::create(path.as_ref().join("dims"), vectors.dims());
let len = Json::create(path.as_ref().join("len"), vectors.len());

View File

@ -1,5 +1,4 @@
use crate::Storage;
use base::operator::Operator;
use base::scalar::*;
use base::search::*;
use base::vector::*;
@ -13,7 +12,7 @@ pub struct VecStorage<T> {
slice: MmapArray<T>,
}
impl<O: Operator<VectorOwned = Vecf32Owned>> Vectors<O> for VecStorage<F32> {
impl Vectors<Vecf32Owned> for VecStorage<F32> {
fn dims(&self) -> u32 {
*self.dims
}
@ -29,8 +28,8 @@ impl<O: Operator<VectorOwned = Vecf32Owned>> Vectors<O> for VecStorage<F32> {
}
}
impl<O: Operator<VectorOwned = Vecf32Owned>> Storage<O> for VecStorage<F32> {
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<O>) -> Self {
impl Storage<Vecf32Owned> for VecStorage<F32> {
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<Vecf32Owned>) -> Self {
std::fs::create_dir(path.as_ref()).unwrap();
let dims = Json::create(path.as_ref().join("dims"), vectors.dims());
let len = Json::create(path.as_ref().join("len"), vectors.len());
@ -49,7 +48,7 @@ impl<O: Operator<VectorOwned = Vecf32Owned>> Storage<O> for VecStorage<F32> {
}
}
impl<O: Operator<VectorOwned = Vecf16Owned>> Vectors<O> for VecStorage<F16> {
impl Vectors<Vecf16Owned> for VecStorage<F16> {
fn dims(&self) -> u32 {
*self.dims
}
@ -65,8 +64,8 @@ impl<O: Operator<VectorOwned = Vecf16Owned>> Vectors<O> for VecStorage<F16> {
}
}
impl<O: Operator<VectorOwned = Vecf16Owned>> Storage<O> for VecStorage<F16> {
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<O>) -> Self {
impl Storage<Vecf16Owned> for VecStorage<F16> {
fn create(path: impl AsRef<Path>, vectors: &impl Vectors<Vecf16Owned>) -> Self {
std::fs::create_dir(path.as_ref()).unwrap();
let dims = Json::create(path.as_ref().join("dims"), vectors.dims());
let len = Json::create(path.as_ref().join("len"), vectors.len());

View File

@ -13,7 +13,7 @@ rm -rf ./build/vectors-pg${VERSION}_${SEMVER}_${PLATFORM}.deb
mkdir -p ./build/dir_zip
cp -a ./sql/upgrade/. ./build/dir_zip/
cp ./target/vectors--$SEMVER.sql ./build/dir_zip/vectors--$SEMVER.sql
cp ./target/schema.sql ./build/dir_zip/vectors--$SEMVER.sql
sed -e "s/@CARGO_VERSION@/$SEMVER/g" < ./vectors.control > ./build/dir_zip/vectors.control
cp ./target/${ARCH}-unknown-linux-gnu/release/libvectors.so ./build/dir_zip/vectors.so
zip ./build/vectors-pg${VERSION}_${ARCH}-unknown-linux-gnu_${SEMVER}.zip -j ./build/dir_zip/*

View File

@ -1,31 +1,44 @@
use base::index::*;
use pgrx::guc::{GucContext, GucFlags, GucRegistry, GucSetting};
static FLAT_SQ_RERANK_SIZE: GucSetting<i32> = GucSetting::<i32>::new(0);
static FLAT_SQ_RERANK_SIZE: GucSetting<i32> =
GucSetting::<i32>::new(SearchOptions::default_flat_sq_rerank_size() as i32);
static FLAT_SQ_FAST_SCAN: GucSetting<bool> = GucSetting::<bool>::new(false);
static FLAT_SQ_FAST_SCAN: GucSetting<bool> =
GucSetting::<bool>::new(SearchOptions::default_flat_sq_fast_scan());
static FLAT_PQ_RERANK_SIZE: GucSetting<i32> = GucSetting::<i32>::new(0);
static FLAT_PQ_RERANK_SIZE: GucSetting<i32> =
GucSetting::<i32>::new(SearchOptions::default_flat_pq_rerank_size() as i32);
static FLAT_PQ_FAST_SCAN: GucSetting<bool> = GucSetting::<bool>::new(false);
static FLAT_PQ_FAST_SCAN: GucSetting<bool> =
GucSetting::<bool>::new(SearchOptions::default_flat_pq_fast_scan());
static IVF_SQ_RERANK_SIZE: GucSetting<i32> = GucSetting::<i32>::new(0);
static IVF_SQ_RERANK_SIZE: GucSetting<i32> =
GucSetting::<i32>::new(SearchOptions::default_ivf_sq_rerank_size() as i32);
static IVF_SQ_FAST_SCAN: GucSetting<bool> = GucSetting::<bool>::new(false);
static IVF_SQ_FAST_SCAN: GucSetting<bool> =
GucSetting::<bool>::new(SearchOptions::default_ivf_sq_fast_scan());
static IVF_PQ_RERANK_SIZE: GucSetting<i32> = GucSetting::<i32>::new(0);
static IVF_PQ_RERANK_SIZE: GucSetting<i32> =
GucSetting::<i32>::new(SearchOptions::default_ivf_pq_rerank_size() as i32);
static IVF_PQ_FAST_SCAN: GucSetting<bool> = GucSetting::<bool>::new(false);
static IVF_PQ_FAST_SCAN: GucSetting<bool> =
GucSetting::<bool>::new(SearchOptions::default_ivf_pq_fast_scan());
static IVF_NPROBE: GucSetting<i32> = GucSetting::<i32>::new(10);
static IVF_NPROBE: GucSetting<i32> =
GucSetting::<i32>::new(SearchOptions::default_ivf_nprobe() as i32);
static HNSW_EF_SEARCH: GucSetting<i32> = GucSetting::<i32>::new(100);
static HNSW_EF_SEARCH: GucSetting<i32> =
GucSetting::<i32>::new(SearchOptions::default_hnsw_ef_search() as i32);
static RABITQ_NPROBE: GucSetting<i32> = GucSetting::<i32>::new(10);
static RABITQ_NPROBE: GucSetting<i32> =
GucSetting::<i32>::new(SearchOptions::default_rabitq_nprobe() as i32);
static RABITQ_FAST_SCAN: GucSetting<bool> = GucSetting::<bool>::new(true);
static RABITQ_FAST_SCAN: GucSetting<bool> =
GucSetting::<bool>::new(SearchOptions::default_rabitq_fast_scan());
static DISKANN_EF_SEARCH: GucSetting<i32> = GucSetting::<i32>::new(100);
static DISKANN_EF_SEARCH: GucSetting<i32> =
GucSetting::<i32>::new(SearchOptions::default_diskann_ef_search() as i32);
pub unsafe fn init() {
GucRegistry::define_int_guc(

View File

@ -10,11 +10,10 @@ if [[ " $@ " =~ --target' '([^ ]+) ]]; then
DIR="./target/$TARGET/debug"
fi
else
TARGET=""
if [[ " $@ " =~ " --release " ]]; then
DIR="./target/release"
elif [[ " $@ " =~ " --profile opt " ]]; then
DIR="./target/$TARGET/opt"
DIR="./target/opt"
else
DIR="./target/debug"
fi
@ -43,6 +42,6 @@ code=$(mktemp)
chmod 700 $code
CONTROL_FILEPATH="./vectors.control" SO_FILEPATH="$DIR/libvectors.so" $(dirname "$0")/schema-codegen.sh >> $code
PGRX_EMBED=$code cargo rustc --bin pgrx_embed_vectors "$@" -- --cfg pgrx_embed
PGRX_EMBED=$code cargo rustc --package pgvectors --bin pgrx_embed_pgvectors "$@" -- --cfg pgrx_embed
CARGO_PKG_VERSION="0.0.0" QEMU_LD_PREFIX=$QEMU_LD_PREFIX "${RUNNER[@]}" "$DIR/pgrx_embed_vectors" | expand -t 4
CARGO_PKG_VERSION="0.0.0" QEMU_LD_PREFIX=$QEMU_LD_PREFIX "${RUNNER[@]}" "$DIR/pgrx_embed_pgvectors" | expand -t 4