From 01740b64a1472721060211d5e1e961dfbb237da8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Jul 2022 12:50:42 -0400 Subject: [PATCH 01/12] Import https://github.com/influxdata/object_store_rs/commit/3c51870ac41a90942c2e45bb499a893d514ed1da --- object_store_rs/.circleci/config.yml | 262 +++++ object_store_rs/.github/dependabot.yml | 15 + .../.github/workflows/semantic.yml | 10 + object_store_rs/.gitignore | 11 + object_store_rs/.kodiak.toml | 2 + object_store_rs/CONTRIBUTING.md | 96 ++ object_store_rs/Cargo.toml | 62 ++ object_store_rs/LICENSE-APACHE | 201 ++++ object_store_rs/LICENSE-MIT | 25 + object_store_rs/README.md | 5 + object_store_rs/SECURITY.md | 5 + object_store_rs/deny.toml | 28 + object_store_rs/rust-toolchain.toml | 3 + object_store_rs/rustfmt.toml | 6 + object_store_rs/src/aws.rs | 996 ++++++++++++++++++ object_store_rs/src/azure.rs | 613 +++++++++++ object_store_rs/src/gcp.rs | 682 ++++++++++++ object_store_rs/src/lib.rs | 678 ++++++++++++ object_store_rs/src/local.rs | 746 +++++++++++++ object_store_rs/src/memory.rs | 277 +++++ object_store_rs/src/oauth.rs | 198 ++++ object_store_rs/src/path/mod.rs | 504 +++++++++ object_store_rs/src/path/parts.rs | 131 +++ object_store_rs/src/throttle.rs | 498 +++++++++ object_store_rs/src/token.rs | 47 + object_store_rs/src/util.rs | 56 + 26 files changed, 6157 insertions(+) create mode 100644 object_store_rs/.circleci/config.yml create mode 100644 object_store_rs/.github/dependabot.yml create mode 100644 object_store_rs/.github/workflows/semantic.yml create mode 100644 object_store_rs/.gitignore create mode 100644 object_store_rs/.kodiak.toml create mode 100644 object_store_rs/CONTRIBUTING.md create mode 100644 object_store_rs/Cargo.toml create mode 100644 object_store_rs/LICENSE-APACHE create mode 100644 object_store_rs/LICENSE-MIT create mode 100644 object_store_rs/README.md create mode 100644 object_store_rs/SECURITY.md create mode 100644 object_store_rs/deny.toml create mode 100644 object_store_rs/rust-toolchain.toml create mode 100644 object_store_rs/rustfmt.toml create mode 100644 object_store_rs/src/aws.rs create mode 100644 object_store_rs/src/azure.rs create mode 100644 object_store_rs/src/gcp.rs create mode 100644 object_store_rs/src/lib.rs create mode 100644 object_store_rs/src/local.rs create mode 100644 object_store_rs/src/memory.rs create mode 100644 object_store_rs/src/oauth.rs create mode 100644 object_store_rs/src/path/mod.rs create mode 100644 object_store_rs/src/path/parts.rs create mode 100644 object_store_rs/src/throttle.rs create mode 100644 object_store_rs/src/token.rs create mode 100644 object_store_rs/src/util.rs diff --git a/object_store_rs/.circleci/config.yml b/object_store_rs/.circleci/config.yml new file mode 100644 index 000000000000..b4dff6d53acc --- /dev/null +++ b/object_store_rs/.circleci/config.yml @@ -0,0 +1,262 @@ +--- +# CI Overview +# ----------- +# +# Each night: +# +# A build image is created (ci_image) from `docker/Dockerfile.ci` and is +# pushed to `quay.io/influxdb/rust:ci`. This build image is then used to run +# the CI tasks for the day. +# +# Every commit: +# +# The CI for every PR and merge to main runs tests, fmt, lints and compiles debug binaries +# +# On main if all these checks pass it will then additionally compile in "release" mode and +# publish a docker image to quay.io/influxdb/iox:$COMMIT_SHA +# +# Manual CI Image: +# +# It is possible to manually trigger a rebuild of the image used in CI. 
To do this, navigate to +# https://app.circleci.com/pipelines/github/influxdata/influxdb_iox?branch=main (overriding the +# branch name if desired). Then: +# - Click "Run Pipeline" in the top-right +# - Expand "Add Parameters" +# - Add a "boolean" parameter called "ci_image" with the value true +# - Click "Run Pipeline" +# +# If you refresh the page you should see a newly running ci_image workflow +# + +version: 2.1 + +orbs: + win: circleci/windows@4.1 + +commands: + rust_components: + description: Verify installed components + steps: + - run: + name: Verify installed components + command: | + rustup --version + rustup show + cargo fmt --version + cargo clippy --version + + cache_restore: + description: Restore Cargo Cache + steps: + - restore_cache: + name: Restoring Cargo Cache + keys: + - cargo-cache-{{ arch }}-{{ .Branch }}-{{ checksum "Cargo.lock" }} + - cargo-cache-{{ arch }}-{{ .Branch }} + - cargo-cache + cache_save: + description: Save Cargo Cache + steps: + - save_cache: + name: Save Cargo Cache + paths: + - /usr/local/cargo/registry + key: cargo-cache-{{ arch }}-{{ .Branch }}-{{ checksum "Cargo.lock" }} + +jobs: + fmt: + docker: + - image: quay.io/influxdb/rust:ci + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. + RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - rust_components + - cache_restore + - run: + name: Rust fmt + command: cargo fmt --all -- --check + - cache_save + lint: + docker: + - image: quay.io/influxdb/rust:ci + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. + RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - rust_components + - cache_restore + - run: + name: Clippy + command: cargo clippy --all-targets --all-features --workspace -- -D warnings + - cache_save + cargo_audit: + docker: + - image: quay.io/influxdb/rust:ci + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. + RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - rust_components + - cache_restore + - run: + name: Install cargo-deny + command: cargo install --force cargo-deny + - run: + name: cargo-deny Checks + command: cargo deny check -s + - cache_save + check: + docker: + - image: quay.io/influxdb/rust:ci + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. 
+ RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - rust_components + - cache_restore + - run: + name: Install cargo-hack + command: cargo install cargo-hack + - run: + name: Check all features + command: cargo hack check --feature-powerset --no-dev-deps --workspace + - cache_save + doc: + docker: + - image: quay.io/influxdb/rust:ci + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. + RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - rust_components + - cache_restore + - run: + name: Cargo doc + # excluding datafusion because it's effectively a dependency masqueraded as workspace crate. + command: cargo doc --document-private-items --no-deps --workspace --exclude datafusion + - cache_save + - run: + name: Compress Docs + command: tar -cvzf rustdoc.tar.gz target/doc/ + - store_artifacts: + path: rustdoc.tar.gz + test: + # setup multiple docker images (see https://circleci.com/docs/2.0/configuration-reference/#docker) + docker: + - image: quay.io/influxdb/rust:ci + - image: localstack/localstack:0.14.4 + - image: mcr.microsoft.com/azure-storage/azurite + - image: fsouza/fake-gcs-server + command: + - "-scheme" + - "http" + resource_class: 2xlarge # use of a smaller executor tends crashes on link + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. 
+ RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + RUST_BACKTRACE: "1" + # Run integration tests + TEST_INTEGRATION: 1 + AWS_DEFAULT_REGION: "us-east-1" + AWS_ACCESS_KEY_ID: test + AWS_SECRET_ACCESS_KEY: test + AWS_ENDPOINT: http://127.0.0.1:4566 + AZURE_USE_EMULATOR: "1" + GOOGLE_SERVICE_ACCOUNT: "/tmp/gcs.json" + OBJECT_STORE_BUCKET: test-bucket + steps: + - run: + name: Setup localstack (AWS emulation) + command: | + cd /tmp + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip awscliv2.zip + sudo ./aws/install + aws --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket + - run: + name: Setup Azurite (Azure emulation) + # the magical connection string is from https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio#http-connection-strings + command: | + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' + - run: + name: Setup fake GCS server + command: | + curl -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" + echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > "$GOOGLE_SERVICE_ACCOUNT" + - checkout + - rust_components + - cache_restore + - run: + name: Cargo test + command: cargo test --workspace --features=aws,azure,azure_test,gcp + - cache_save + + test_windows: + executor: + name: win/default + size: medium + environment: + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - run: + name: Download rustup + command: wget https://win.rustup.rs/x86_64 -O rustup-init.exe + - run: + name: Install rustup + command: .\rustup-init.exe -y --default-host=x86_64-pc-windows-msvc + - run: + name: Cargo test + command: cargo test --workspace + +workflows: + version: 2 + + # CI for all pull requests. 
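+  # The workflow fans out into independent jobs; the Linux `test` job runs the
+  # aws/azure/gcp integration tests against the localstack, Azurite and
+  # fake-gcs-server side containers configured above.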
+ ci: + jobs: + - check + - fmt + - lint + - cargo_audit + - test + - test_windows + - doc diff --git a/object_store_rs/.github/dependabot.yml b/object_store_rs/.github/dependabot.yml new file mode 100644 index 000000000000..656e622d3fce --- /dev/null +++ b/object_store_rs/.github/dependabot.yml @@ -0,0 +1,15 @@ +--- +version: 2 +updates: + - package-ecosystem: "cargo" + directory: "/" + schedule: + interval: daily + open-pull-requests-limit: 10 + ignore: + # Thrift version needs to match the version of the thrift-compiler used to generate code, + # and therefore this dependency requires a more manual upgrade + # + # Additionally the thrift-compiler version available in standard repos tends to lag + # the latest release significantly, and so updating to the latest version adds friction + - dependency-name: "thrift" diff --git a/object_store_rs/.github/workflows/semantic.yml b/object_store_rs/.github/workflows/semantic.yml new file mode 100644 index 000000000000..5fbae6dc995b --- /dev/null +++ b/object_store_rs/.github/workflows/semantic.yml @@ -0,0 +1,10 @@ +--- +name: "Semantic PR and Commit Messages" + +on: + pull_request: + types: [opened, reopened, synchronize, edited] + +jobs: + semantic: + uses: influxdata/validate-semantic-github-messages/.github/workflows/semantic.yml@main diff --git a/object_store_rs/.gitignore b/object_store_rs/.gitignore new file mode 100644 index 000000000000..6f38ebf95f1a --- /dev/null +++ b/object_store_rs/.gitignore @@ -0,0 +1,11 @@ +**/target +**/*.rs.bk +.idea/ +.env +.gdb_history +*.tsm +**/.DS_Store +**/.vscode +massif.out.* +valgrind-out.txt +Cargo.lock diff --git a/object_store_rs/.kodiak.toml b/object_store_rs/.kodiak.toml new file mode 100644 index 000000000000..f96092d04828 --- /dev/null +++ b/object_store_rs/.kodiak.toml @@ -0,0 +1,2 @@ +version = 1 +merge.method = "squash" diff --git a/object_store_rs/CONTRIBUTING.md b/object_store_rs/CONTRIBUTING.md new file mode 100644 index 000000000000..188b7ee7de2e --- /dev/null +++ b/object_store_rs/CONTRIBUTING.md @@ -0,0 +1,96 @@ +# Contributing + +Thank you for thinking of contributing! We very much welcome contributions from the community. To make the process +easier and more valuable for everyone involved we have a few rules and guidelines to follow. + +Anyone with a Github account is free to file issues on the project. However, if you want to contribute documentation or +code then you will need to sign InfluxData's Individual Contributor License Agreement (CLA), which can be found with +more information [on our website](https://www.influxdata.com/legal/cla/). + +## Submitting Issues and Feature Requests + +Before you file an [issue](https://github.com/influxdata/object_store_rs/issues/new), please search existing issues in +case +the same or similar issues have already been filed. If you find an existing open ticket covering your issue then please +avoid adding "👍" or "me too" comments; Github notifications can cause a lot of noise for the project maintainers who +triage the back-log. + +However, if you have a new piece of information for an existing ticket and you think it may help the investigation or +resolution, then please do add it as a comment! + +You can signal to the team that you're experiencing an existing issue with one of Github's emoji reactions (these are a +good way to add "weight" to an issue from a prioritization perspective). + +## Running Tests + +Tests can be run using `cargo` + +```shell +cargo test +``` + +## Running Integration Tests + +By default, integration tests are not run. 
To run them you will need to set `TEST_INTEGRATION=1` and then provide the +necessary configuration for that object store + +### AWS + +To test the S3 integration against [localstack](https://localstack.cloud/) + +First start up a container running localstack + +``` +$ podman run --rm -it -p 4566:4566 -p 4510-4559:4510-4559 localstack/localstack +``` + +Setup environment + +``` +export TEST_INTEGRATION=1 +export AWS_DEFAULT_REGION=us-east-1 +export AWS_ACCESS_KEY_ID=test +export AWS_SECRET_ACCESS_KEY=test +export AWS_ENDPOINT=http://127.0.0.1:4566 +export OBJECT_STORE_BUCKET=test-bucket +``` + +Create a bucket using the AWS CLI + +``` +podman run --net=host --env-host amazon/aws-cli --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket +``` + +Run tests + +``` +$ cargo test --features aws +``` + +### Azure + +To test the Azure integration +against [azurite](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio) + +Startup azurite + +``` +$ podman run -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azure-storage/azurite +``` + +Create a bucket + +``` +$ podman run --net=host mcr.microsoft.com/azure-cli az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' +``` + +Run tests + +``` +$ cargo test --features azure +``` + +### GCP + +We don't have a good story yet for testing the GCP integration locally. You will need to create a GCS bucket, a +service account that has access to it, and use this to run the tests. diff --git a/object_store_rs/Cargo.toml b/object_store_rs/Cargo.toml new file mode 100644 index 000000000000..762a3de96e57 --- /dev/null +++ b/object_store_rs/Cargo.toml @@ -0,0 +1,62 @@ +[package] +name = "object_store" +version = "0.3.0" +edition = "2021" +license = "MIT/Apache-2.0" +readme = "README.md" +description = "A generic object store interface" +keywords = [ + "object", + "storage", + "cloud", +] +repository = "https://github.com/influxdata/object_store_rs" + +[package.metadata.docs.rs] +all-features = true + +[dependencies] # In alphabetical order +async-trait = "0.1.53" +# Microsoft Azure Blob storage integration +azure_core = { version = "0.2", optional = true, default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage = { version = "0.2", optional = true, default-features = false, features = ["account"] } +azure_storage_blobs = { version = "0.2", optional = true, default-features = false, features = ["enable_reqwest_rustls"] } +bytes = "1.0" +chrono = { version = "0.4", default-features = false, features = ["clock"] } +# Google Cloud Storage integration +futures = "0.3" +serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } +serde_json = { version = "1.0", default-features = false, optional = true } +rustls-pemfile = { version = "1.0", default-features = false, optional = true } +ring = { version = "0.16", default-features = false, features = ["std"] } +base64 = { version = "0.13", default-features = false, optional = true } +# for rusoto +hyper = { version = "0.14", optional = true, default-features = false } +# for rusoto +hyper-rustls = { version = "0.23.0", optional = true, default-features = false, features = ["webpki-tokio", "http1", "http2", "tls12"] } +itertools = "0.10.1" 
+percent-encoding = "2.1" +# rusoto crates are for Amazon S3 integration +rusoto_core = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } +rusoto_credential = { version = "0.48.0", optional = true, default-features = false } +rusoto_s3 = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } +rusoto_sts = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } +snafu = "0.7" +tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time"] } +tracing = { version = "0.1" } +reqwest = { version = "0.11", optional = true, default-features = false, features = ["rustls-tls"] } +parking_lot = { version = "0.12" } +# Filesystem integration +url = "2.2" +walkdir = "2" + +[features] +azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest"] +azure_test = ["azure", "azure_core/azurite_workaround", "azure_storage/azurite_workaround", "azure_storage_blobs/azurite_workaround"] +gcp = ["serde", "serde_json", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64"] +aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "hyper-rustls"] + +[dev-dependencies] # In alphabetical order +dotenv = "0.15.0" +tempfile = "3.1.0" +futures-test = "0.3" diff --git a/object_store_rs/LICENSE-APACHE b/object_store_rs/LICENSE-APACHE new file mode 100644 index 000000000000..16fe87b06e80 --- /dev/null +++ b/object_store_rs/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/object_store_rs/LICENSE-MIT b/object_store_rs/LICENSE-MIT new file mode 100644 index 000000000000..42d41dfd83f6 --- /dev/null +++ b/object_store_rs/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2020 InfluxData + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/object_store_rs/README.md b/object_store_rs/README.md new file mode 100644 index 000000000000..d5745a96d271 --- /dev/null +++ b/object_store_rs/README.md @@ -0,0 +1,5 @@ +# Rust Object Store + +A crate providing a generic interface to object stores, such as S3, Azure Blob Storage and Google Cloud Storage. Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out for external consumption. + +See [docs.rs](https://docs.rs/object_store) for usage instructions diff --git a/object_store_rs/SECURITY.md b/object_store_rs/SECURITY.md new file mode 100644 index 000000000000..2b708f96e6dd --- /dev/null +++ b/object_store_rs/SECURITY.md @@ -0,0 +1,5 @@ +# Security Policy + +## Reporting a Vulnerability + +InfluxData takes security and our users' trust very seriously. If you believe you have found a security issue in any of our open source projects, please responsibly disclose it by contacting security@influxdata.com. More details about security vulnerability reporting, including our GPG key, can be found at https://www.influxdata.com/how-to-report-security-vulnerabilities/. diff --git a/object_store_rs/deny.toml b/object_store_rs/deny.toml new file mode 100644 index 000000000000..02324f790019 --- /dev/null +++ b/object_store_rs/deny.toml @@ -0,0 +1,28 @@ +# Configuration documentation: +#  https://embarkstudios.github.io/cargo-deny/index.html + +[advisories] +vulnerability = "deny" +yanked = "deny" +unmaintained = "warn" +notice = "warn" +ignore = [ +] +git-fetch-with-cli = true + +[licenses] +default = "allow" +unlicensed = "allow" +copyleft = "allow" + +[bans] +multiple-versions = "warn" +deny = [ + # We are using rustls as the TLS implementation, so we shouldn't be linking + # in OpenSSL too. + # + # If you're hitting this, you might want to take a look at what new + # dependencies you have introduced and check if there's a way to depend on + # rustls instead of OpenSSL (tip: check the crate's feature flags). + { name = "openssl-sys" } +] diff --git a/object_store_rs/rust-toolchain.toml b/object_store_rs/rust-toolchain.toml new file mode 100644 index 000000000000..1e0347218dc8 --- /dev/null +++ b/object_store_rs/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +channel = "1.60" +components = [ "rustfmt", "clippy" ] diff --git a/object_store_rs/rustfmt.toml b/object_store_rs/rustfmt.toml new file mode 100644 index 000000000000..7b060483beca --- /dev/null +++ b/object_store_rs/rustfmt.toml @@ -0,0 +1,6 @@ +edition = "2021" + +# Unstable features not yet supported on stable Rust +#wrap_comments = true +#imports_granularity = "Crate" +#group_imports = "StdExternalCrate" diff --git a/object_store_rs/src/aws.rs b/object_store_rs/src/aws.rs new file mode 100644 index 000000000000..57bfeaf6372d --- /dev/null +++ b/object_store_rs/src/aws.rs @@ -0,0 +1,996 @@ +//! 
An object store implementation for S3 +use crate::util::format_http_range; +use crate::{ + collect_bytes, + path::{Path, DELIMITER}, + util::format_prefix, + GetResult, ListResult, ObjectMeta, ObjectStore, Result, +}; +use async_trait::async_trait; +use bytes::Bytes; +use chrono::{DateTime, Utc}; +use futures::{ + stream::{self, BoxStream}, + Future, Stream, StreamExt, TryStreamExt, +}; +use hyper::client::Builder as HyperBuilder; +use rusoto_core::ByteStream; +use rusoto_credential::{InstanceMetadataProvider, StaticProvider}; +use rusoto_s3::S3; +use rusoto_sts::WebIdentityProvider; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::ops::Range; +use std::{convert::TryFrom, fmt, num::NonZeroUsize, ops::Deref, sync::Arc, time::Duration}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; +use tracing::{debug, warn}; + +/// The maximum number of times a request will be retried in the case of an AWS server error +pub const MAX_NUM_RETRIES: u32 = 3; + +/// A specialized `Error` for object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Expected streamed data to have length {}, got {}", expected, actual))] + DataDoesNotMatchLength { expected: usize, actual: usize }, + + #[snafu(display("Did not receive any data. Bucket: {}, Location: {}", bucket, path))] + NoData { bucket: String, path: String }, + + #[snafu(display( + "Unable to DELETE data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToDeleteData { + source: rusoto_core::RusotoError, + bucket: String, + path: String, + }, + + #[snafu(display( + "Unable to GET data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToGetData { + source: rusoto_core::RusotoError, + bucket: String, + path: String, + }, + + #[snafu(display( + "Unable to HEAD data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToHeadData { + source: rusoto_core::RusotoError, + bucket: String, + path: String, + }, + + #[snafu(display( + "Unable to GET part of the data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToGetPieceOfData { + source: std::io::Error, + bucket: String, + path: String, + }, + + #[snafu(display( + "Unable to PUT data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToPutData { + source: rusoto_core::RusotoError, + bucket: String, + path: String, + }, + + #[snafu(display( + "Unable to list data. Bucket: {}, Error: {} ({:?})", + bucket, + source, + source, + ))] + UnableToListData { + source: rusoto_core::RusotoError, + bucket: String, + }, + + #[snafu(display( + "Unable to copy object. Bucket: {}, From: {}, To: {}, Error: {}", + bucket, + from, + to, + source, + ))] + UnableToCopyObject { + source: rusoto_core::RusotoError, + bucket: String, + from: String, + to: String, + }, + + #[snafu(display( + "Unable to parse last modified date. Bucket: {}, Error: {} ({:?})", + bucket, + source, + source, + ))] + UnableToParseLastModified { + source: chrono::ParseError, + bucket: String, + }, + + #[snafu(display( + "Unable to buffer data into temporary file, Error: {} ({:?})", + source, + source, + ))] + UnableToBufferStream { source: std::io::Error }, + + #[snafu(display( + "Could not parse `{}` as an AWS region. Regions should look like `us-east-2`. 
{} ({:?})", + region, + source, + source, + ))] + InvalidRegion { + region: String, + source: rusoto_core::region::ParseRegionError, + }, + + #[snafu(display("Missing aws-access-key"))] + MissingAccessKey, + + #[snafu(display("Missing aws-secret-access-key"))] + MissingSecretAccessKey, + + NotFound { + path: String, + source: Box, + }, +} + +impl From for super::Error { + fn from(source: Error) -> Self { + match source { + Error::NotFound { path, source } => Self::NotFound { path, source }, + _ => Self::Generic { + store: "S3", + source: Box::new(source), + }, + } + } +} + +/// Configuration for connecting to [Amazon S3](https://aws.amazon.com/s3/). +pub struct AmazonS3 { + /// S3 client w/o any connection limit. + /// + /// You should normally use [`Self::client`] instead. + client_unrestricted: rusoto_s3::S3Client, + + /// Semaphore that limits the usage of [`client_unrestricted`](Self::client_unrestricted). + connection_semaphore: Arc, + + /// Bucket name used by this object store client. + bucket_name: String, +} + +impl fmt::Debug for AmazonS3 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("AmazonS3") + .field("client", &"rusoto_s3::S3Client") + .field("bucket_name", &self.bucket_name) + .finish() + } +} + +impl fmt::Display for AmazonS3 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "AmazonS3({})", self.bucket_name) + } +} + +#[async_trait] +impl ObjectStore for AmazonS3 { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + let bucket_name = self.bucket_name.clone(); + let request_factory = move || { + let bytes = bytes.clone(); + + let length = bytes.len(); + let stream_data = Ok(bytes); + let stream = futures::stream::once(async move { stream_data }); + let byte_stream = ByteStream::new_with_size(stream, length); + + rusoto_s3::PutObjectRequest { + bucket: bucket_name.clone(), + key: location.to_string(), + body: Some(byte_stream), + ..Default::default() + } + }; + + let s3 = self.client().await; + + s3_request(move || { + let (s3, request_factory) = (s3.clone(), request_factory.clone()); + + async move { s3.put_object(request_factory()).await } + }) + .await + .context(UnableToPutDataSnafu { + bucket: &self.bucket_name, + path: location.as_ref(), + })?; + + Ok(()) + } + + async fn get(&self, location: &Path) -> Result { + Ok(GetResult::Stream( + self.get_object(location, None).await?.boxed(), + )) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let size_hint = range.end - range.start; + let stream = self.get_object(location, Some(range)).await?; + collect_bytes(stream, Some(size_hint)).await + } + + async fn head(&self, location: &Path) -> Result { + let key = location.to_string(); + let head_request = rusoto_s3::HeadObjectRequest { + bucket: self.bucket_name.clone(), + key: key.clone(), + ..Default::default() + }; + let s = self + .client() + .await + .head_object(head_request) + .await + .map_err(|e| match e { + rusoto_core::RusotoError::Service(rusoto_s3::HeadObjectError::NoSuchKey(_)) => { + Error::NotFound { + path: key.clone(), + source: e.into(), + } + } + rusoto_core::RusotoError::Unknown(h) if h.status.as_u16() == 404 => { + Error::NotFound { + path: key.clone(), + source: "resource not found".into(), + } + } + _ => Error::UnableToHeadData { + bucket: self.bucket_name.to_owned(), + path: key.clone(), + source: e, + }, + })?; + + // Note: GetObject and HeadObject return a different date format from ListObjects + // + // S3 List returns timestamps in the form + // 
2013-09-17T18:07:53.000Z + // S3 GetObject returns timestamps in the form + // Last-Modified: Sun, 1 Jan 2006 12:00:00 GMT + let last_modified = match s.last_modified { + Some(lm) => DateTime::parse_from_rfc2822(&lm) + .context(UnableToParseLastModifiedSnafu { + bucket: &self.bucket_name, + })? + .with_timezone(&Utc), + None => Utc::now(), + }; + + Ok(ObjectMeta { + last_modified, + location: location.clone(), + size: usize::try_from(s.content_length.unwrap_or(0)) + .expect("unsupported size on this platform"), + }) + } + + async fn delete(&self, location: &Path) -> Result<()> { + let bucket_name = self.bucket_name.clone(); + + let request_factory = move || rusoto_s3::DeleteObjectRequest { + bucket: bucket_name.clone(), + key: location.to_string(), + ..Default::default() + }; + + let s3 = self.client().await; + + s3_request(move || { + let (s3, request_factory) = (s3.clone(), request_factory.clone()); + + async move { s3.delete_object(request_factory()).await } + }) + .await + .context(UnableToDeleteDataSnafu { + bucket: &self.bucket_name, + path: location.as_ref(), + })?; + + Ok(()) + } + + async fn list(&self, prefix: Option<&Path>) -> Result>> { + Ok(self + .list_objects_v2(prefix, None) + .await? + .map_ok(move |list_objects_v2_result| { + let contents = list_objects_v2_result.contents.unwrap_or_default(); + let iter = contents + .into_iter() + .map(|object| convert_object_meta(object, &self.bucket_name)); + + futures::stream::iter(iter) + }) + .try_flatten() + .boxed()) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + Ok(self + .list_objects_v2(prefix, Some(DELIMITER.to_string())) + .await? + .try_fold( + ListResult { + common_prefixes: vec![], + objects: vec![], + }, + |acc, list_objects_v2_result| async move { + let mut res = acc; + let contents = list_objects_v2_result.contents.unwrap_or_default(); + let mut objects = contents + .into_iter() + .map(|object| convert_object_meta(object, &self.bucket_name)) + .collect::>>()?; + + res.objects.append(&mut objects); + + let prefixes = list_objects_v2_result.common_prefixes.unwrap_or_default(); + res.common_prefixes.reserve(prefixes.len()); + + for p in prefixes { + let prefix = p.prefix.expect("can't have a prefix without a value"); + res.common_prefixes.push(Path::parse(prefix)?); + } + + Ok(res) + }, + ) + .await?) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + let from = from.as_ref(); + let to = to.as_ref(); + let bucket_name = self.bucket_name.clone(); + + let request_factory = move || rusoto_s3::CopyObjectRequest { + bucket: bucket_name.clone(), + copy_source: format!("{}/{}", &bucket_name, from), + key: to.to_string(), + ..Default::default() + }; + + let s3 = self.client().await; + + s3_request(move || { + let (s3, request_factory) = (s3.clone(), request_factory.clone()); + + async move { s3.copy_object(request_factory()).await } + }) + .await + .context(UnableToCopyObjectSnafu { + bucket: &self.bucket_name, + from, + to, + })?; + + Ok(()) + } + + async fn copy_if_not_exists(&self, _source: &Path, _dest: &Path) -> Result<()> { + // Will need dynamodb_lock + Err(crate::Error::NotImplemented) + } +} + +fn convert_object_meta(object: rusoto_s3::Object, bucket: &str) -> Result { + let key = object.key.expect("object doesn't exist without a key"); + let location = Path::parse(key)?; + let last_modified = match object.last_modified { + Some(lm) => DateTime::parse_from_rfc3339(&lm) + .context(UnableToParseLastModifiedSnafu { bucket })? 
+ .with_timezone(&Utc), + None => Utc::now(), + }; + let size = + usize::try_from(object.size.unwrap_or(0)).expect("unsupported size on this platform"); + + Ok(ObjectMeta { + location, + last_modified, + size, + }) +} + +/// Configure a connection to Amazon S3 using the specified credentials in +/// the specified Amazon region and bucket. +#[allow(clippy::too_many_arguments)] +pub fn new_s3( + access_key_id: Option>, + secret_access_key: Option>, + region: impl Into, + bucket_name: impl Into, + endpoint: Option>, + session_token: Option>, + max_connections: NonZeroUsize, + allow_http: bool, +) -> Result { + let region = region.into(); + let region: rusoto_core::Region = match endpoint { + None => region.parse().context(InvalidRegionSnafu { region })?, + Some(endpoint) => rusoto_core::Region::Custom { + name: region, + endpoint: endpoint.into(), + }, + }; + + let mut builder = HyperBuilder::default(); + builder.pool_max_idle_per_host(max_connections.get()); + + let connector = if allow_http { + hyper_rustls::HttpsConnectorBuilder::new() + .with_webpki_roots() + .https_or_http() + .enable_http1() + .enable_http2() + .build() + } else { + hyper_rustls::HttpsConnectorBuilder::new() + .with_webpki_roots() + .https_only() + .enable_http1() + .enable_http2() + .build() + }; + + let http_client = rusoto_core::request::HttpClient::from_builder(builder, connector); + + let client = match (access_key_id, secret_access_key, session_token) { + (Some(access_key_id), Some(secret_access_key), Some(session_token)) => { + let credentials_provider = StaticProvider::new( + access_key_id.into(), + secret_access_key.into(), + Some(session_token.into()), + None, + ); + rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) + } + (Some(access_key_id), Some(secret_access_key), None) => { + let credentials_provider = + StaticProvider::new_minimal(access_key_id.into(), secret_access_key.into()); + rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) + } + (None, Some(_), _) => return Err(Error::MissingAccessKey.into()), + (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), + _ if std::env::var_os("AWS_WEB_IDENTITY_TOKEN_FILE").is_some() => { + rusoto_s3::S3Client::new_with(http_client, WebIdentityProvider::from_k8s_env(), region) + } + _ => rusoto_s3::S3Client::new_with(http_client, InstanceMetadataProvider::new(), region), + }; + + Ok(AmazonS3 { + client_unrestricted: client, + connection_semaphore: Arc::new(Semaphore::new(max_connections.get())), + bucket_name: bucket_name.into(), + }) +} + +/// Create a new [`AmazonS3`] that always errors +pub fn new_failing_s3() -> Result { + new_s3( + Some("foo"), + Some("bar"), + "us-east-1", + "bucket", + None as Option<&str>, + None as Option<&str>, + NonZeroUsize::new(16).unwrap(), + true, + ) +} + +/// S3 client bundled w/ a semaphore permit. +#[derive(Clone)] +struct SemaphoreClient { + /// Permit for this specific use of the client. + /// + /// Note that this field is never read and therefore considered "dead code" by rustc. + #[allow(dead_code)] + permit: Arc, + + inner: rusoto_s3::S3Client, +} + +impl Deref for SemaphoreClient { + type Target = rusoto_s3::S3Client; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl AmazonS3 { + /// Get a client according to the current connection limit. 
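+    ///
+    /// Waits for a permit from `connection_semaphore` (created with `max_connections`
+    /// permits in [`new_s3`]) and bundles it with the underlying client, so at most
+    /// that many requests are in flight at once; the permit is released when the
+    /// returned [`SemaphoreClient`] is dropped.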
+ async fn client(&self) -> SemaphoreClient { + let permit = Arc::clone(&self.connection_semaphore) + .acquire_owned() + .await + .expect("semaphore shouldn't be closed yet"); + SemaphoreClient { + permit: Arc::new(permit), + inner: self.client_unrestricted.clone(), + } + } + + async fn get_object( + &self, + location: &Path, + range: Option>, + ) -> Result>> { + let key = location.to_string(); + let get_request = rusoto_s3::GetObjectRequest { + bucket: self.bucket_name.clone(), + key: key.clone(), + range: range.map(format_http_range), + ..Default::default() + }; + let bucket_name = self.bucket_name.clone(); + let stream = self + .client() + .await + .get_object(get_request) + .await + .map_err(|e| match e { + rusoto_core::RusotoError::Service(rusoto_s3::GetObjectError::NoSuchKey(_)) => { + Error::NotFound { + path: key.clone(), + source: e.into(), + } + } + _ => Error::UnableToGetData { + bucket: self.bucket_name.to_owned(), + path: key.clone(), + source: e, + }, + })? + .body + .context(NoDataSnafu { + bucket: self.bucket_name.to_owned(), + path: key.clone(), + })? + .map_err(move |source| Error::UnableToGetPieceOfData { + source, + bucket: bucket_name.clone(), + path: key.clone(), + }) + .err_into(); + + Ok(stream) + } + + async fn list_objects_v2( + &self, + prefix: Option<&Path>, + delimiter: Option, + ) -> Result>> { + enum ListState { + Start, + HasMore(String), + Done, + } + + let prefix = format_prefix(prefix); + let bucket = self.bucket_name.clone(); + + let request_factory = move || rusoto_s3::ListObjectsV2Request { + bucket, + prefix, + delimiter, + ..Default::default() + }; + let s3 = self.client().await; + + Ok(stream::unfold(ListState::Start, move |state| { + let request_factory = request_factory.clone(); + let s3 = s3.clone(); + + async move { + let continuation_token = match &state { + ListState::HasMore(continuation_token) => Some(continuation_token), + ListState::Done => { + return None; + } + // If this is the first request we've made, we don't need to make any + // modifications to the request + ListState::Start => None, + }; + + let resp = s3_request(move || { + let (s3, request_factory, continuation_token) = ( + s3.clone(), + request_factory.clone(), + continuation_token.cloned(), + ); + + async move { + s3.list_objects_v2(rusoto_s3::ListObjectsV2Request { + continuation_token, + ..request_factory() + }) + .await + } + }) + .await; + + let resp = match resp { + Ok(resp) => resp, + Err(e) => return Some((Err(e), state)), + }; + + // The AWS response contains a field named `is_truncated` as well as + // `next_continuation_token`, and we're assuming that `next_continuation_token` + // is only set when `is_truncated` is true (and therefore not + // checking `is_truncated`). + let next_state = + if let Some(next_continuation_token) = &resp.next_continuation_token { + ListState::HasMore(next_continuation_token.to_string()) + } else { + ListState::Done + }; + + Some((Ok(resp), next_state)) + } + }) + .map_err(move |e| { + Error::UnableToListData { + source: e, + bucket: self.bucket_name.clone(), + } + .into() + }) + .boxed()) + } +} + +/// Handles retrying a request to S3 up to `MAX_NUM_RETRIES` times if S3 returns 5xx server errors. +/// +/// The `future_factory` argument is a function `F` that takes no arguments and, when called, will +/// return a `Future` (type `G`) that, when `await`ed, will perform a request to S3 through +/// `rusoto` and return a `Result` that returns some type `R` on success and some +/// `rusoto_core::RusotoError` on error. 
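+///
+/// With `MAX_NUM_RETRIES = 3` and the `2u64.pow(attempts) * 50` ms backoff used below, a
+/// request that keeps failing with a 5xx error is attempted four times in total, sleeping
+/// roughly 100 ms, 200 ms and 400 ms between attempts before the final error is returned.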
+/// +/// If the executed `Future` returns success, this function will return that success. +/// If the executed `Future` returns a 5xx server error, this function will wait an amount of +/// time that increases exponentially with the number of times it has retried, get a new `Future` by +/// calling `future_factory` again, and retry the request by `await`ing the `Future` again. +/// The retries will continue until the maximum number of retries has been attempted. In that case, +/// this function will return the last encountered error. +/// +/// Client errors (4xx) will never be retried by this function. +async fn s3_request(future_factory: F) -> Result> +where + E: std::error::Error + Send, + F: Fn() -> G + Send, + G: Future>> + Send, + R: Send, +{ + let mut attempts = 0; + + loop { + let request = future_factory(); + + let result = request.await; + + match result { + Ok(r) => return Ok(r), + Err(error) => { + attempts += 1; + + let should_retry = matches!( + error, + rusoto_core::RusotoError::Unknown(ref response) + if response.status.is_server_error() + ); + + if attempts > MAX_NUM_RETRIES { + warn!( + ?error, + attempts, "maximum number of retries exceeded for AWS S3 request" + ); + return Err(error); + } else if !should_retry { + return Err(error); + } else { + debug!(?error, attempts, "retrying AWS S3 request"); + let wait_time = Duration::from_millis(2u64.pow(attempts) * 50); + tokio::time::sleep(wait_time).await; + } + } + } + } +} + +impl Error { + #[cfg(test)] + fn s3_error_due_to_credentials(&self) -> bool { + use rusoto_core::RusotoError; + use Error::*; + + matches!( + self, + UnableToPutData { + source: RusotoError::Credentials(_), + bucket: _, + path: _, + } | UnableToGetData { + source: RusotoError::Credentials(_), + bucket: _, + path: _, + } | UnableToDeleteData { + source: RusotoError::Credentials(_), + bucket: _, + path: _, + } | UnableToListData { + source: RusotoError::Credentials(_), + bucket: _, + } + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + tests::{ + get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, + }, + Error as ObjectStoreError, ObjectStore, + }; + use bytes::Bytes; + use std::env; + + type TestError = Box; + type Result = std::result::Result; + + const NON_EXISTENT_NAME: &str = "nonexistentname"; + + #[derive(Debug)] + struct AwsConfig { + access_key_id: String, + secret_access_key: String, + region: String, + bucket: String, + endpoint: Option, + token: Option, + } + + // Helper macro to skip tests if TEST_INTEGRATION and the AWS environment variables are not set. + macro_rules! 
maybe_skip_integration { + () => {{ + dotenv::dotenv().ok(); + + let required_vars = [ + "AWS_DEFAULT_REGION", + "OBJECT_STORE_BUCKET", + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + ]; + let unset_vars: Vec<_> = required_vars + .iter() + .filter_map(|&name| match env::var(name) { + Ok(_) => None, + Err(_) => Some(name), + }) + .collect(); + let unset_var_names = unset_vars.join(", "); + + let force = env::var("TEST_INTEGRATION"); + + if force.is_ok() && !unset_var_names.is_empty() { + panic!( + "TEST_INTEGRATION is set, \ + but variable(s) {} need to be set", + unset_var_names + ); + } else if force.is_err() { + eprintln!( + "skipping AWS integration test - set {}TEST_INTEGRATION to run", + if unset_var_names.is_empty() { + String::new() + } else { + format!("{} and ", unset_var_names) + } + ); + return; + } else { + AwsConfig { + access_key_id: env::var("AWS_ACCESS_KEY_ID") + .expect("already checked AWS_ACCESS_KEY_ID"), + secret_access_key: env::var("AWS_SECRET_ACCESS_KEY") + .expect("already checked AWS_SECRET_ACCESS_KEY"), + region: env::var("AWS_DEFAULT_REGION") + .expect("already checked AWS_DEFAULT_REGION"), + bucket: env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET"), + endpoint: env::var("AWS_ENDPOINT").ok(), + token: env::var("AWS_SESSION_TOKEN").ok(), + } + } + }}; + } + + fn check_credentials(r: Result) -> Result { + if let Err(e) = &r { + let e = &**e; + if let Some(e) = e.downcast_ref::() { + if e.s3_error_due_to_credentials() { + eprintln!( + "Try setting the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY \ + environment variables" + ); + } + } + } + + r + } + + fn make_integration(config: AwsConfig) -> AmazonS3 { + new_s3( + Some(config.access_key_id), + Some(config.secret_access_key), + config.region, + config.bucket, + config.endpoint, + config.token, + NonZeroUsize::new(16).unwrap(), + true, + ) + .expect("Valid S3 config") + } + + #[tokio::test] + async fn s3_test() { + let config = maybe_skip_integration!(); + let integration = make_integration(config); + + check_credentials(put_get_delete_list(&integration).await).unwrap(); + check_credentials(list_uses_directories_correctly(&integration).await).unwrap(); + check_credentials(list_with_delimiter(&integration).await).unwrap(); + check_credentials(rename_and_copy(&integration).await).unwrap(); + } + + #[tokio::test] + async fn s3_test_get_nonexistent_location() { + let config = maybe_skip_integration!(); + let integration = make_integration(config); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = get_nonexistent_object(&integration, Some(location)) + .await + .unwrap_err(); + if let ObjectStoreError::NotFound { path, source } = err { + let source_variant = source.downcast_ref::>(); + assert!( + matches!( + source_variant, + Some(rusoto_core::RusotoError::Service( + rusoto_s3::GetObjectError::NoSuchKey(_) + )), + ), + "got: {:?}", + source_variant + ); + assert_eq!(path, NON_EXISTENT_NAME); + } else { + panic!("unexpected error type: {:?}", err); + } + } + + #[tokio::test] + async fn s3_test_get_nonexistent_bucket() { + let mut config = maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = make_integration(config); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.get(&location).await.unwrap_err().to_string(); + assert!( + err.contains("The specified bucket does not exist"), + "{}", + err + ) + } + + #[tokio::test] + async fn s3_test_put_nonexistent_bucket() { + let mut config = 
maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = make_integration(config); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + let data = Bytes::from("arbitrary data"); + + let err = integration + .put(&location, data) + .await + .unwrap_err() + .to_string(); + + assert!( + err.contains("The specified bucket does not exist") + && err.contains("Unable to PUT data"), + "{}", + err + ) + } + + #[tokio::test] + async fn s3_test_delete_nonexistent_location() { + let config = maybe_skip_integration!(); + let integration = make_integration(config); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + integration.delete(&location).await.unwrap(); + } + + #[tokio::test] + async fn s3_test_delete_nonexistent_bucket() { + let mut config = maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = make_integration(config); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.delete(&location).await.unwrap_err().to_string(); + assert!( + err.contains("The specified bucket does not exist") + && err.contains("Unable to DELETE data"), + "{}", + err + ) + } +} diff --git a/object_store_rs/src/azure.rs b/object_store_rs/src/azure.rs new file mode 100644 index 000000000000..74b947d3d3b4 --- /dev/null +++ b/object_store_rs/src/azure.rs @@ -0,0 +1,613 @@ +//! An object store implementation for Azure blob storage +use crate::{ + path::{Path, DELIMITER}, + util::format_prefix, + GetResult, ListResult, ObjectMeta, ObjectStore, Result, +}; +use async_trait::async_trait; +use azure_core::{prelude::*, HttpClient}; +use azure_storage::core::prelude::{AsStorageClient, StorageAccountClient}; +use azure_storage_blobs::blob::responses::ListBlobsResponse; +use azure_storage_blobs::blob::Blob; +use azure_storage_blobs::{ + prelude::{AsBlobClient, AsContainerClient, ContainerClient}, + DeleteSnapshotsMethod, +}; +use bytes::Bytes; +use futures::{ + stream::{self, BoxStream}, + StreamExt, TryStreamExt, +}; +use snafu::{ResultExt, Snafu}; +use std::collections::BTreeSet; +use std::{convert::TryInto, sync::Arc}; + +/// A specialized `Error` for Azure object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display( + "Unable to DELETE data. Container: {}, Location: {}, Error: {} ({:?})", + container, + path, + source, + source, + ))] + UnableToDeleteData { + source: Box, + container: String, + path: String, + }, + + #[snafu(display( + "Unable to GET data. Container: {}, Location: {}, Error: {} ({:?})", + container, + path, + source, + source, + ))] + UnableToGetData { + source: Box, + container: String, + path: String, + }, + + #[snafu(display( + "Unable to HEAD data. Container: {}, Location: {}, Error: {} ({:?})", + container, + path, + source, + source, + ))] + UnableToHeadData { + source: Box, + container: String, + path: String, + }, + + #[snafu(display( + "Unable to GET part of the data. Container: {}, Location: {}, Error: {} ({:?})", + container, + path, + source, + source, + ))] + UnableToGetPieceOfData { + source: Box, + container: String, + path: String, + }, + + #[snafu(display( + "Unable to PUT data. Bucket: {}, Location: {}, Error: {} ({:?})", + container, + path, + source, + source, + ))] + UnableToPutData { + source: Box, + container: String, + path: String, + }, + + #[snafu(display( + "Unable to list data. 
Bucket: {}, Error: {} ({:?})",
+        container,
+        source,
+        source,
+    ))]
+    UnableToListData {
+        source: Box<dyn std::error::Error + Send + Sync>,
+        container: String,
+    },
+
+    #[snafu(display(
+        "Unable to copy object. Container: {}, From: {}, To: {}, Error: {}",
+        container,
+        from,
+        to,
+        source
+    ))]
+    UnableToCopyFile {
+        source: Box<dyn std::error::Error + Send + Sync>,
+        container: String,
+        from: String,
+        to: String,
+    },
+
+    #[snafu(display("Unable to parse source url. Container: {}, Error: {}", container, source))]
+    UnableToParseUrl {
+        source: url::ParseError,
+        container: String,
+    },
+
+    NotFound {
+        path: String,
+        source: Box<dyn std::error::Error + Send + Sync>,
+    },
+
+    AlreadyExists {
+        path: String,
+        source: Box<dyn std::error::Error + Send + Sync>,
+    },
+
+    #[cfg(not(feature = "azure_test"))]
+    #[snafu(display(
+        "Azurite (azure emulator) support not compiled in, please add `azure_test` feature"
+    ))]
+    NoEmulatorFeature,
+}
+
+impl From<Error> for super::Error {
+    fn from(source: Error) -> Self {
+        match source {
+            Error::NotFound { path, source } => Self::NotFound { path, source },
+            Error::AlreadyExists { path, source } => Self::AlreadyExists { path, source },
+            _ => Self::Generic {
+                store: "Azure Blob Storage",
+                source: Box::new(source),
+            },
+        }
+    }
+}
+
+/// Configuration for connecting to [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/).
+#[derive(Debug)]
+pub struct MicrosoftAzure {
+    container_client: Arc<ContainerClient>,
+    container_name: String,
+    blob_base_url: String,
+    is_emulator: bool,
+}
+
+impl std::fmt::Display for MicrosoftAzure {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self.is_emulator {
+            true => write!(f, "MicrosoftAzureEmulator({})", self.container_name),
+            false => write!(f, "MicrosoftAzure({})", self.container_name),
+        }
+    }
+}
+
+#[allow(clippy::borrowed_box)]
+fn check_err_not_found(err: &Box<dyn std::error::Error + Send + Sync>) -> bool {
+    if let Some(azure_core::HttpError::StatusCode { status, ..
}) = + err.downcast_ref::() + { + return status.as_u16() == 404; + }; + false +} + +#[async_trait] +impl ObjectStore for MicrosoftAzure { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + let bytes = bytes::BytesMut::from(&*bytes); + + self.container_client + .as_blob_client(location.as_ref()) + .put_block_blob(bytes) + .execute() + .await + .context(UnableToPutDataSnafu { + container: &self.container_name, + path: location.to_owned(), + })?; + + Ok(()) + } + + async fn get(&self, location: &Path) -> Result { + let blob = self + .container_client + .as_blob_client(location.as_ref()) + .get() + .execute() + .await + .map_err(|err| { + if check_err_not_found(&err) { + return Error::NotFound { + source: err, + path: location.to_string(), + }; + }; + Error::UnableToGetData { + source: err, + container: self.container_name.clone(), + path: location.to_string(), + } + })?; + + Ok(GetResult::Stream( + futures::stream::once(async move { Ok(blob.data) }).boxed(), + )) + } + + async fn get_range(&self, location: &Path, range: std::ops::Range) -> Result { + let blob = self + .container_client + .as_blob_client(location.as_ref()) + .get() + .range(range) + .execute() + .await + .map_err(|err| { + if check_err_not_found(&err) { + return Error::NotFound { + source: err, + path: location.to_string(), + }; + }; + Error::UnableToGetPieceOfData { + source: err, + container: self.container_name.clone(), + path: location.to_string(), + } + })?; + + Ok(blob.data) + } + + async fn head(&self, location: &Path) -> Result { + let res = self + .container_client + .as_blob_client(location.as_ref()) + .get_properties() + .execute() + .await + .map_err(|err| { + if check_err_not_found(&err) { + return Error::NotFound { + source: err, + path: location.to_string(), + }; + }; + Error::UnableToHeadData { + source: err, + container: self.container_name.clone(), + path: location.to_string(), + } + })?; + + convert_object_meta(res.blob)?.ok_or_else(|| super::Error::NotFound { + path: location.to_string(), + source: "is directory".to_string().into(), + }) + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.container_client + .as_blob_client(location.as_ref()) + .delete() + .delete_snapshots_method(DeleteSnapshotsMethod::Include) + .execute() + .await + .context(UnableToDeleteDataSnafu { + container: &self.container_name, + path: location.to_string(), + })?; + + Ok(()) + } + + async fn list(&self, prefix: Option<&Path>) -> Result>> { + let stream = self + .list_impl(prefix, false) + .await? + .map_ok(|resp| { + let names = resp + .blobs + .blobs + .into_iter() + .filter_map(|blob| convert_object_meta(blob).transpose()); + futures::stream::iter(names) + }) + .try_flatten() + .boxed(); + + Ok(stream) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let mut stream = self.list_impl(prefix, true).await?; + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + while let Some(res) = stream.next().await { + let response = res?; + + let prefixes = response.blobs.blob_prefix.unwrap_or_default(); + for p in prefixes { + common_prefixes.insert(Path::parse(&p.name)?); + } + + let blobs = response.blobs.blobs; + objects.reserve(blobs.len()); + for blob in blobs { + if let Some(meta) = convert_object_meta(blob)? 
{ + objects.push(meta); + } + } + } + + Ok(ListResult { + common_prefixes: common_prefixes.into_iter().collect(), + objects, + }) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + let from_url = self.get_copy_from_url(from)?; + self.container_client + .as_blob_client(to.as_ref()) + .copy(&from_url) + .execute() + .await + .context(UnableToCopyFileSnafu { + container: &self.container_name, + from: from.as_ref(), + to: to.as_ref(), + })?; + Ok(()) + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let from_url = self.get_copy_from_url(from)?; + self.container_client + .as_blob_client(to.as_ref()) + .copy(&from_url) + .if_match_condition(IfMatchCondition::NotMatch("*".to_string())) + .execute() + .await + .map_err(|err| { + if let Some(azure_core::HttpError::StatusCode { status, .. }) = + err.downcast_ref::() + { + if status.as_u16() == 409 { + return Error::AlreadyExists { + source: err, + path: to.to_string(), + }; + }; + }; + Error::UnableToCopyFile { + source: err, + container: self.container_name.clone(), + from: from.to_string(), + to: to.to_string(), + } + })?; + Ok(()) + } +} + +impl MicrosoftAzure { + /// helper function to create a source url for copy function + fn get_copy_from_url(&self, from: &Path) -> Result { + Ok(reqwest::Url::parse(&format!( + "{}/{}/{}", + &self.blob_base_url, self.container_name, from + )) + .context(UnableToParseUrlSnafu { + container: &self.container_name, + })?) + } + + async fn list_impl( + &self, + prefix: Option<&Path>, + delimiter: bool, + ) -> Result>> { + enum ListState { + Start, + HasMore(String), + Done, + } + + let prefix_raw = format_prefix(prefix); + + Ok(stream::unfold(ListState::Start, move |state| { + let mut request = self.container_client.list_blobs(); + + if let Some(p) = prefix_raw.as_deref() { + request = request.prefix(p); + } + + if delimiter { + request = request.delimiter(Delimiter::new(DELIMITER)); + } + + async move { + match state { + ListState::HasMore(ref marker) => { + request = request.next_marker(marker as &str); + } + ListState::Done => { + return None; + } + ListState::Start => {} + } + + let resp = match request.execute().await.context(UnableToListDataSnafu { + container: &self.container_name, + }) { + Ok(resp) => resp, + Err(err) => return Some((Err(crate::Error::from(err)), state)), + }; + + let next_state = if let Some(marker) = &resp.next_marker { + ListState::HasMore(marker.as_str().to_string()) + } else { + ListState::Done + }; + + Some((Ok(resp), next_state)) + } + }) + .boxed()) + } +} + +/// Returns `None` if is a directory +fn convert_object_meta(blob: Blob) -> Result> { + let location = Path::parse(blob.name)?; + let last_modified = blob.properties.last_modified; + let size = blob + .properties + .content_length + .try_into() + .expect("unsupported size on this platform"); + + // This is needed to filter out gen2 directories + // https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-known-issues#blob-storage-apis + Ok((size > 0).then(|| ObjectMeta { + location, + last_modified, + size, + })) +} + +#[cfg(feature = "azure_test")] +fn check_if_emulator_works() -> Result<()> { + Ok(()) +} + +#[cfg(not(feature = "azure_test"))] +fn check_if_emulator_works() -> Result<()> { + Err(Error::NoEmulatorFeature.into()) +} + +/// Configure a connection to container with given name on Microsoft Azure +/// Blob store. +/// +/// The credentials `account` and `access_key` must provide access to the +/// store. 
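+///
+/// # Example
+///
+/// A minimal sketch; the account, access key and container names below are
+/// placeholders, not real credentials:
+///
+/// ```ignore
+/// let azure = new_azure("my-account", "my-access-key", "my-container", false)?;
+/// ```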
+pub fn new_azure( + account: impl Into, + access_key: impl Into, + container_name: impl Into, + use_emulator: bool, +) -> Result { + let account = account.into(); + let access_key = access_key.into(); + let http_client: Arc = Arc::new(reqwest::Client::new()); + + let (is_emulator, storage_account_client) = if use_emulator { + check_if_emulator_works()?; + (true, StorageAccountClient::new_emulator_default()) + } else { + ( + false, + StorageAccountClient::new_access_key(Arc::clone(&http_client), &account, &access_key), + ) + }; + + let storage_client = storage_account_client.as_storage_client(); + let blob_base_url = storage_account_client + .blob_storage_url() + .as_ref() + // make url ending consistent between the emulator and remote storage account + .trim_end_matches('/') + .to_string(); + + let container_name = container_name.into(); + + let container_client = storage_client.as_container_client(&container_name); + + Ok(MicrosoftAzure { + container_client, + container_name, + blob_base_url, + is_emulator, + }) +} + +#[cfg(test)] +mod tests { + use crate::azure::new_azure; + use crate::tests::{ + copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, + }; + use std::env; + + #[derive(Debug)] + struct AzureConfig { + storage_account: String, + access_key: String, + bucket: String, + use_emulator: bool, + } + + // Helper macro to skip tests if TEST_INTEGRATION and the Azure environment + // variables are not set. + macro_rules! maybe_skip_integration { + () => {{ + dotenv::dotenv().ok(); + + let use_emulator = std::env::var("AZURE_USE_EMULATOR").is_ok(); + + let mut required_vars = vec!["OBJECT_STORE_BUCKET"]; + if !use_emulator { + required_vars.push("AZURE_STORAGE_ACCOUNT"); + required_vars.push("AZURE_STORAGE_ACCESS_KEY"); + } + let unset_vars: Vec<_> = required_vars + .iter() + .filter_map(|&name| match env::var(name) { + Ok(_) => None, + Err(_) => Some(name), + }) + .collect(); + let unset_var_names = unset_vars.join(", "); + + let force = std::env::var("TEST_INTEGRATION"); + + if force.is_ok() && !unset_var_names.is_empty() { + panic!( + "TEST_INTEGRATION is set, \ + but variable(s) {} need to be set", + unset_var_names + ) + } else if force.is_err() { + eprintln!( + "skipping Azure integration test - set {}TEST_INTEGRATION to run", + if unset_var_names.is_empty() { + String::new() + } else { + format!("{} and ", unset_var_names) + } + ); + return; + } else { + AzureConfig { + storage_account: env::var("AZURE_STORAGE_ACCOUNT").unwrap_or_default(), + access_key: env::var("AZURE_STORAGE_ACCESS_KEY").unwrap_or_default(), + bucket: env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET"), + use_emulator, + } + } + }}; + } + + #[tokio::test] + async fn azure_blob_test() { + let config = maybe_skip_integration!(); + let integration = new_azure( + config.storage_account, + config.access_key, + config.bucket, + config.use_emulator, + ) + .unwrap(); + + put_get_delete_list(&integration).await.unwrap(); + list_uses_directories_correctly(&integration).await.unwrap(); + list_with_delimiter(&integration).await.unwrap(); + rename_and_copy(&integration).await.unwrap(); + copy_if_not_exists(&integration).await.unwrap(); + } +} diff --git a/object_store_rs/src/gcp.rs b/object_store_rs/src/gcp.rs new file mode 100644 index 000000000000..b36838043273 --- /dev/null +++ b/object_store_rs/src/gcp.rs @@ -0,0 +1,682 @@ +//! 
An object store implementation for Google Cloud Storage +use std::collections::BTreeSet; +use std::fs::File; +use std::io::BufReader; +use std::ops::Range; + +use async_trait::async_trait; +use bytes::Bytes; +use chrono::{DateTime, Utc}; +use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; +use reqwest::header::RANGE; +use reqwest::{header, Client, Method, Response, StatusCode}; +use snafu::{ResultExt, Snafu}; + +use crate::util::format_http_range; +use crate::{ + oauth::OAuthProvider, + path::{Path, DELIMITER}, + token::TokenCache, + util::format_prefix, + GetResult, ListResult, ObjectMeta, ObjectStore, Result, +}; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Unable to open service account file: {}", source))] + OpenCredentials { source: std::io::Error }, + + #[snafu(display("Unable to decode service account file: {}", source))] + DecodeCredentials { source: serde_json::Error }, + + #[snafu(display("Error performing list request: {}", source))] + ListRequest { source: reqwest::Error }, + + #[snafu(display("Error performing get request {}: {}", path, source))] + GetRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing delete request {}: {}", path, source))] + DeleteRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing copy request {}: {}", path, source))] + CopyRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing put request: {}", source))] + PutRequest { source: reqwest::Error }, + + #[snafu(display("Error decoding object size: {}", source))] + InvalidSize { source: std::num::ParseIntError }, +} + +impl From for super::Error { + fn from(err: Error) -> Self { + match err { + Error::GetRequest { source, path } + | Error::DeleteRequest { source, path } + | Error::CopyRequest { source, path } + if matches!(source.status(), Some(StatusCode::NOT_FOUND)) => + { + Self::NotFound { + path, + source: Box::new(source), + } + } + _ => Self::Generic { + store: "GCS", + source: Box::new(err), + }, + } + } +} + +/// A deserialized `service-account-********.json`-file. +#[derive(serde::Deserialize, Debug)] +struct ServiceAccountCredentials { + /// The private key in RSA format. + pub private_key: String, + + /// The email address associated with the service account. + pub client_email: String, + + /// Base URL for GCS + #[serde(default = "default_gcs_base_url")] + pub gcs_base_url: String, + + /// Disable oauth and use empty tokens. + #[serde(default = "default_disable_oauth")] + pub disable_oauth: bool, +} + +fn default_gcs_base_url() -> String { + "https://storage.googleapis.com".to_owned() +} + +fn default_disable_oauth() -> bool { + false +} + +#[derive(serde::Deserialize, Debug)] +#[serde(rename_all = "camelCase")] +struct ListResponse { + next_page_token: Option, + #[serde(default)] + prefixes: Vec, + #[serde(default)] + items: Vec, +} + +#[derive(serde::Deserialize, Debug)] +struct Object { + name: String, + size: String, + updated: DateTime, +} + +/// Configuration for connecting to [Google Cloud Storage](https://cloud.google.com/storage/). 
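+///
+/// Instances are constructed with [`new_gcs`]; a minimal sketch, where the
+/// service account path and bucket name are placeholders:
+///
+/// ```ignore
+/// let gcs = new_gcs("/path/to/service-account.json", "my-bucket")?;
+/// ```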
+#[derive(Debug)] +pub struct GoogleCloudStorage { + client: Client, + base_url: String, + + oauth_provider: Option, + token_cache: TokenCache, + + bucket_name: String, + bucket_name_encoded: String, + + // TODO: Hook this up in tests + max_list_results: Option, +} + +impl std::fmt::Display for GoogleCloudStorage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "GoogleCloudStorage({})", self.bucket_name) + } +} + +impl GoogleCloudStorage { + async fn get_token(&self) -> Result { + if let Some(oauth_provider) = &self.oauth_provider { + Ok(self + .token_cache + .get_or_insert_with(|| oauth_provider.fetch_token(&self.client)) + .await?) + } else { + Ok("".to_owned()) + } + } + + fn object_url(&self, path: &Path) -> String { + let encoded = percent_encoding::utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); + format!( + "{}/storage/v1/b/{}/o/{}", + self.base_url, self.bucket_name_encoded, encoded + ) + } + + /// Perform a get request + async fn get_request( + &self, + path: &Path, + range: Option>, + head: bool, + ) -> Result { + let token = self.get_token().await?; + let url = self.object_url(path); + + let mut builder = self.client.request(Method::GET, url); + + if let Some(range) = range { + builder = builder.header(RANGE, format_http_range(range)); + } + + let alt = match head { + true => "json", + false => "media", + }; + + let response = builder + .bearer_auth(token) + .query(&[("alt", alt)]) + .send() + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })? + .error_for_status() + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } + + /// Perform a put request + async fn put_request(&self, path: &Path, payload: Bytes) -> Result<()> { + let token = self.get_token().await?; + let url = format!( + "{}/upload/storage/v1/b/{}/o", + self.base_url, self.bucket_name_encoded + ); + + self.client + .request(Method::POST, url) + .bearer_auth(token) + .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_LENGTH, payload.len()) + .query(&[("uploadType", "media"), ("name", path.as_ref())]) + .body(payload) + .send() + .await + .context(PutRequestSnafu)? + .error_for_status() + .context(PutRequestSnafu)?; + + Ok(()) + } + + /// Perform a delete request + async fn delete_request(&self, path: &Path) -> Result<()> { + let token = self.get_token().await?; + let url = self.object_url(path); + + let builder = self.client.request(Method::DELETE, url); + builder + .bearer_auth(token) + .send() + .await + .context(DeleteRequestSnafu { + path: path.as_ref(), + })? + .error_for_status() + .context(DeleteRequestSnafu { + path: path.as_ref(), + })?; + + Ok(()) + } + + /// Perform a copy request + async fn copy_request(&self, from: &Path, to: &Path, if_not_exists: bool) -> Result<()> { + let token = self.get_token().await?; + + let source = percent_encoding::utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); + let destination = percent_encoding::utf8_percent_encode(to.as_ref(), NON_ALPHANUMERIC); + let url = format!( + "{}/storage/v1/b/{}/o/{}/copyTo/b/{}/o/{}", + self.base_url, self.bucket_name_encoded, source, self.bucket_name_encoded, destination + ); + + let mut builder = self.client.request(Method::POST, url); + + if if_not_exists { + builder = builder.query(&[("ifGenerationMatch", "0")]); + } + + builder + .bearer_auth(token) + .send() + .await + .context(CopyRequestSnafu { + path: from.as_ref(), + })? 
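+            // Surface non-2xx responses as errors; a 404 here becomes a
+            // `CopyRequest` error, which the `From<Error>` impl above maps to `NotFound`.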
+ .error_for_status() + .context(CopyRequestSnafu { + path: from.as_ref(), + })?; + + Ok(()) + } + + /// Perform a list request + async fn list_request( + &self, + prefix: Option<&str>, + delimiter: bool, + page_token: Option<&str>, + ) -> Result { + let token = self.get_token().await?; + + let url = format!( + "{}/storage/v1/b/{}/o", + self.base_url, self.bucket_name_encoded + ); + + let mut query = Vec::with_capacity(4); + if delimiter { + query.push(("delimiter", DELIMITER)) + } + + if let Some(prefix) = &prefix { + query.push(("prefix", prefix)) + } + + if let Some(page_token) = page_token { + query.push(("pageToken", page_token)) + } + + if let Some(max_results) = &self.max_list_results { + query.push(("maxResults", max_results)) + } + + let response: ListResponse = self + .client + .request(Method::GET, url) + .query(&query) + .bearer_auth(token) + .send() + .await + .context(ListRequestSnafu)? + .error_for_status() + .context(ListRequestSnafu)? + .json() + .await + .context(ListRequestSnafu)?; + + Ok(response) + } + + /// Perform a list operation automatically handling pagination + fn list_paginated( + &self, + prefix: Option<&Path>, + delimiter: bool, + ) -> Result>> { + let prefix = format_prefix(prefix); + + enum ListState { + Start, + HasMore(String), + Done, + } + + Ok(futures::stream::unfold(ListState::Start, move |state| { + let prefix = prefix.clone(); + + async move { + let page_token = match &state { + ListState::Start => None, + ListState::HasMore(page_token) => Some(page_token.as_str()), + ListState::Done => { + return None; + } + }; + + let resp = match self + .list_request(prefix.as_deref(), delimiter, page_token) + .await + { + Ok(resp) => resp, + Err(e) => return Some((Err(e), state)), + }; + + let next_state = match &resp.next_page_token { + Some(token) => ListState::HasMore(token.clone()), + None => ListState::Done, + }; + + Some((Ok(resp), next_state)) + } + }) + .boxed()) + } +} + +#[async_trait] +impl ObjectStore for GoogleCloudStorage { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.put_request(location, bytes).await + } + + async fn get(&self, location: &Path) -> Result { + let response = self.get_request(location, None, false).await?; + let stream = response + .bytes_stream() + .map_err(|source| crate::Error::Generic { + store: "GCS", + source: Box::new(source), + }) + .boxed(); + + Ok(GetResult::Stream(stream)) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let response = self.get_request(location, Some(range), false).await?; + Ok(response.bytes().await.context(GetRequestSnafu { + path: location.as_ref(), + })?) + } + + async fn head(&self, location: &Path) -> Result { + let response = self.get_request(location, None, true).await?; + let object = response.json().await.context(GetRequestSnafu { + path: location.as_ref(), + })?; + convert_object_meta(&object) + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.delete_request(location).await + } + + async fn list(&self, prefix: Option<&Path>) -> Result>> { + let stream = self + .list_paginated(prefix, false)? 
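+            // Convert each page of results to ObjectMeta and flatten the pages
+            // into a single stream.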
+ .map_ok(|r| futures::stream::iter(r.items.into_iter().map(|x| convert_object_meta(&x)))) + .try_flatten() + .boxed(); + + Ok(stream) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let mut stream = self.list_paginated(prefix, true)?; + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + while let Some(result) = stream.next().await { + let response = result?; + + for p in response.prefixes { + common_prefixes.insert(Path::parse(p)?); + } + + objects.reserve(response.items.len()); + for object in &response.items { + objects.push(convert_object_meta(object)?); + } + } + + Ok(ListResult { + common_prefixes: common_prefixes.into_iter().collect(), + objects, + }) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.copy_request(from, to, false).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.copy_request(from, to, true).await + } +} + +fn reader_credentials_file( + service_account_path: impl AsRef, +) -> Result { + let file = File::open(service_account_path).context(OpenCredentialsSnafu)?; + let reader = BufReader::new(file); + Ok(serde_json::from_reader(reader).context(DecodeCredentialsSnafu)?) +} + +/// Configure a connection to Google Cloud Storage. +pub fn new_gcs( + service_account_path: impl AsRef, + bucket_name: impl Into, +) -> Result { + let credentials = reader_credentials_file(service_account_path)?; + let client = Client::new(); + + // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes + let scope = "https://www.googleapis.com/auth/devstorage.full_control"; + let audience = "https://www.googleapis.com/oauth2/v4/token".to_string(); + + let oauth_provider = (!credentials.disable_oauth) + .then(|| { + OAuthProvider::new( + credentials.client_email, + credentials.private_key, + scope.to_string(), + audience, + ) + }) + .transpose()?; + + let bucket_name = bucket_name.into(); + let encoded_bucket_name = percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); + + // The cloud storage crate currently only supports authentication via + // environment variables. Set the environment variable explicitly so + // that we can optionally accept command line arguments instead. + Ok(GoogleCloudStorage { + client, + base_url: credentials.gcs_base_url, + oauth_provider, + token_cache: Default::default(), + bucket_name, + bucket_name_encoded: encoded_bucket_name, + max_list_results: None, + }) +} + +fn convert_object_meta(object: &Object) -> Result { + let location = Path::parse(&object.name)?; + let last_modified = object.updated; + let size = object.size.parse().context(InvalidSizeSnafu)?; + + Ok(ObjectMeta { + location, + last_modified, + size, + }) +} + +#[cfg(test)] +mod test { + use std::env; + + use bytes::Bytes; + + use crate::{ + tests::{ + get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, + }, + Error as ObjectStoreError, ObjectStore, + }; + + use super::*; + + const NON_EXISTENT_NAME: &str = "nonexistentname"; + + #[derive(Debug)] + struct GoogleCloudConfig { + bucket: String, + service_account: String, + } + + // Helper macro to skip tests if TEST_INTEGRATION and the GCP environment variables are not set. + macro_rules! 
maybe_skip_integration { + () => {{ + dotenv::dotenv().ok(); + + let required_vars = ["OBJECT_STORE_BUCKET", "GOOGLE_SERVICE_ACCOUNT"]; + let unset_vars: Vec<_> = required_vars + .iter() + .filter_map(|&name| match env::var(name) { + Ok(_) => None, + Err(_) => Some(name), + }) + .collect(); + let unset_var_names = unset_vars.join(", "); + + let force = std::env::var("TEST_INTEGRATION"); + + if force.is_ok() && !unset_var_names.is_empty() { + panic!( + "TEST_INTEGRATION is set, \ + but variable(s) {} need to be set", + unset_var_names + ) + } else if force.is_err() { + eprintln!( + "skipping Google Cloud integration test - set {}TEST_INTEGRATION to run", + if unset_var_names.is_empty() { + String::new() + } else { + format!("{} and ", unset_var_names) + } + ); + return; + } else { + GoogleCloudConfig { + bucket: env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET"), + service_account: env::var("GOOGLE_SERVICE_ACCOUNT") + .expect("already checked GOOGLE_SERVICE_ACCOUNT"), + } + } + }}; + } + + #[tokio::test] + async fn gcs_test() { + let config = maybe_skip_integration!(); + let integration = new_gcs(config.service_account, config.bucket).unwrap(); + + put_get_delete_list(&integration).await.unwrap(); + list_uses_directories_correctly(&integration).await.unwrap(); + list_with_delimiter(&integration).await.unwrap(); + rename_and_copy(&integration).await.unwrap(); + } + + #[tokio::test] + async fn gcs_test_get_nonexistent_location() { + let config = maybe_skip_integration!(); + let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.get(&location).await.unwrap_err(); + + assert!( + matches!(err, ObjectStoreError::NotFound { .. }), + "unexpected error type: {}", + err + ); + } + + #[tokio::test] + async fn gcs_test_get_nonexistent_bucket() { + let mut config = maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = get_nonexistent_object(&integration, Some(location)) + .await + .unwrap_err(); + + assert!( + matches!(err, ObjectStoreError::NotFound { .. }), + "unexpected error type: {}", + err + ); + } + + #[tokio::test] + async fn gcs_test_delete_nonexistent_location() { + let config = maybe_skip_integration!(); + let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.delete(&location).await.unwrap_err(); + assert!( + matches!(err, ObjectStoreError::NotFound { .. }), + "unexpected error type: {}", + err + ); + } + + #[tokio::test] + async fn gcs_test_delete_nonexistent_bucket() { + let mut config = maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.delete(&location).await.unwrap_err(); + assert!( + matches!(err, ObjectStoreError::NotFound { .. 
}), + "unexpected error type: {}", + err + ); + } + + #[tokio::test] + async fn gcs_test_put_nonexistent_bucket() { + let mut config = maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + let data = Bytes::from("arbitrary data"); + + let err = integration + .put(&location, data) + .await + .unwrap_err() + .to_string(); + assert!( + err.contains("Error performing put request: HTTP status client error (404 Not Found)"), + "{}", + err + ) + } +} diff --git a/object_store_rs/src/lib.rs b/object_store_rs/src/lib.rs new file mode 100644 index 000000000000..f95eccdc6945 --- /dev/null +++ b/object_store_rs/src/lib.rs @@ -0,0 +1,678 @@ +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_debug_implementations, + missing_docs, + clippy::explicit_iter_loop, + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr +)] + +//! # object_store +//! +//! This crate provides APIs for interacting with object storage services. +//! +//! It currently supports PUT, GET, DELETE, HEAD and list for: +//! +//! * [Google Cloud Storage](https://cloud.google.com/storage/) +//! * [Amazon S3](https://aws.amazon.com/s3/) +//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/#overview) +//! * In-memory +//! * Local file storage +//! + +#[cfg(feature = "aws")] +pub mod aws; +#[cfg(feature = "azure")] +pub mod azure; +#[cfg(feature = "gcp")] +pub mod gcp; +pub mod local; +pub mod memory; +pub mod path; +pub mod throttle; + +#[cfg(feature = "gcp")] +mod oauth; + +#[cfg(feature = "gcp")] +mod token; + +mod util; + +use crate::path::Path; +use crate::util::{collect_bytes, maybe_spawn_blocking}; +use async_trait::async_trait; +use bytes::Bytes; +use chrono::{DateTime, Utc}; +use futures::{stream::BoxStream, StreamExt}; +use snafu::Snafu; +use std::fmt::{Debug, Formatter}; +use std::io::{Read, Seek, SeekFrom}; +use std::ops::Range; + +/// An alias for a dynamically dispatched object store implementation. +pub type DynObjectStore = dyn ObjectStore; + +/// Universal API to multiple object store services. +#[async_trait] +pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { + /// Save the provided bytes to the specified location. + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()>; + + /// Return the bytes that are stored at the specified location. + async fn get(&self, location: &Path) -> Result; + + /// Return the bytes that are stored at the specified location + /// in the given byte range + async fn get_range(&self, location: &Path, range: Range) -> Result; + + /// Return the metadata for the specified location + async fn head(&self, location: &Path) -> Result; + + /// Delete the object at the specified location. + async fn delete(&self, location: &Path) -> Result<()>; + + /// List all the objects with the given prefix. + /// + /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of + /// `foo/bar_baz/x`. + async fn list(&self, prefix: Option<&Path>) -> Result>>; + + /// List objects with the given prefix and an implementation specific + /// delimiter. Returns common prefixes (directories) in addition to object + /// metadata. + /// + /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of + /// `foo/bar_baz/x`. 
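+    ///
+    /// A sketch of listing a single directory level (assumes `store: &dyn ObjectStore`
+    /// and a tokio context):
+    ///
+    /// ```ignore
+    /// let prefix = Path::from("mydb/wb");
+    /// let listing = store.list_with_delimiter(Some(&prefix)).await?;
+    /// for dir in &listing.common_prefixes {
+    ///     println!("common prefix: {:?}", dir);
+    /// }
+    /// for object in &listing.objects {
+    ///     println!("object: {:?} ({} bytes)", object.location, object.size);
+    /// }
+    /// ```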
+ async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result; + + /// Copy an object from one path to another in the same object store. + /// + /// If there exists an object at the destination, it will be overwritten. + async fn copy(&self, from: &Path, to: &Path) -> Result<()>; + + /// Move an object from one path to another in the same object store. + /// + /// By default, this is implemented as a copy and then delete source. It may not + /// check when deleting source that it was the same object that was originally copied. + /// + /// If there exists an object at the destination, it will be overwritten. + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + self.copy(from, to).await?; + self.delete(from).await + } + + /// Copy an object from one path to another, only if destination is empty. + /// + /// Will return an error if the destination already has an object. + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()>; + + /// Move an object from one path to another in the same object store. + /// + /// Will return an error if the destination already has an object. + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.copy_if_not_exists(from, to).await?; + self.delete(from).await + } +} + +/// Result of a list call that includes objects, prefixes (directories) and a +/// token for the next set of results. Individual result sets may be limited to +/// 1,000 objects based on the underlying object storage's limitations. +#[derive(Debug)] +pub struct ListResult { + /// Prefixes that are common (like directories) + pub common_prefixes: Vec, + /// Object metadata for the listing + pub objects: Vec, +} + +/// The metadata that describes an object. +#[derive(Debug, Clone, PartialEq)] +pub struct ObjectMeta { + /// The full path to the object + pub location: Path, + /// The last modified time + pub last_modified: DateTime, + /// The size in bytes of the object + pub size: usize, +} + +/// Result for a get request +/// +/// This special cases the case of a local file, as some systems may +/// be able to optimise the case of a file already present on local disk +pub enum GetResult { + /// A file and its path on the local filesystem + File(std::fs::File, std::path::PathBuf), + /// An asynchronous stream + Stream(BoxStream<'static, Result>), +} + +impl Debug for GetResult { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::File(_, _) => write!(f, "GetResult(File)"), + Self::Stream(_) => write!(f, "GetResult(Stream)"), + } + } +} + +impl GetResult { + /// Collects the data into a [`Bytes`] + pub async fn bytes(self) -> Result { + match self { + Self::File(mut file, path) => { + maybe_spawn_blocking(move || { + let len = file + .seek(SeekFrom::End(0)) + .map_err(|source| local::Error::Seek { + source, + path: path.clone(), + })?; + + file.seek(SeekFrom::Start(0)) + .map_err(|source| local::Error::Seek { + source, + path: path.clone(), + })?; + + let mut buffer = Vec::with_capacity(len as usize); + file.read_to_end(&mut buffer) + .map_err(|source| local::Error::UnableToReadBytes { source, path })?; + + Ok(buffer.into()) + }) + .await + } + Self::Stream(s) => collect_bytes(s, None).await, + } + } + + /// Converts this into a byte stream + /// + /// If the result is [`Self::File`] will perform chunked reads of the file, otherwise + /// will return the [`Self::Stream`]. 
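+    ///
+    /// A sketch of consuming the stream (assumes `futures::TryStreamExt` is in
+    /// scope and `result` is a [`GetResult`]):
+    ///
+    /// ```ignore
+    /// let mut stream = result.into_stream();
+    /// while let Some(chunk) = stream.try_next().await? {
+    ///     println!("read {} bytes", chunk.len());
+    /// }
+    /// ```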
+ /// + /// # Tokio Compatibility + /// + /// Tokio discourages performing blocking IO on a tokio worker thread, however, + /// no major operating systems have stable async file APIs. Therefore if called from + /// a tokio context, this will use [`tokio::runtime::Handle::spawn_blocking`] to dispatch + /// IO to a blocking thread pool, much like `tokio::fs` does under-the-hood. + /// + /// If not called from a tokio context, this will perform IO on the current thread with + /// no additional complexity or overheads + pub fn into_stream(self) -> BoxStream<'static, Result> { + match self { + Self::File(file, path) => { + const CHUNK_SIZE: usize = 8 * 1024; + + futures::stream::try_unfold((file, path, false), |(mut file, path, finished)| { + maybe_spawn_blocking(move || { + if finished { + return Ok(None); + } + + let mut buffer = Vec::with_capacity(CHUNK_SIZE); + let read = file + .by_ref() + .take(CHUNK_SIZE as u64) + .read_to_end(&mut buffer) + .map_err(|e| local::Error::UnableToReadBytes { + source: e, + path: path.clone(), + })?; + + Ok(Some((buffer.into(), (file, path, read != CHUNK_SIZE)))) + }) + }) + .boxed() + } + Self::Stream(s) => s, + } + } +} + +/// A specialized `Result` for object store-related errors +pub type Result = std::result::Result; + +/// A specialized `Error` for object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Generic {} error: {}", store, source))] + Generic { + store: &'static str, + source: Box, + }, + + #[snafu(display("Object at location {} not found: {}", path, source))] + NotFound { + path: String, + source: Box, + }, + + #[snafu( + display("Encountered object with invalid path: {}", source), + context(false) + )] + InvalidPath { source: path::Error }, + + #[snafu(display("Error joining spawned task: {}", source), context(false))] + JoinError { source: tokio::task::JoinError }, + + #[snafu(display("Operation not supported: {}", source))] + NotSupported { + source: Box, + }, + + #[snafu(display("Object at location {} already exists: {}", path, source))] + AlreadyExists { + path: String, + source: Box, + }, + + #[snafu(display("Operation not yet implemented."))] + NotImplemented, + + #[cfg(feature = "gcp")] + #[snafu(display("OAuth error: {}", source), context(false))] + OAuth { source: oauth::Error }, +} + +#[cfg(test)] +mod test_util { + use super::*; + use futures::TryStreamExt; + + pub async fn flatten_list_stream( + storage: &DynObjectStore, + prefix: Option<&Path>, + ) -> Result> { + storage + .list(prefix) + .await? 
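+            // Reduce each ObjectMeta to its location so tests can compare
+            // against expected path lists.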
+ .map_ok(|meta| meta.location) + .try_collect::>() + .await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_util::flatten_list_stream; + + type Error = Box; + type Result = std::result::Result; + + pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) -> Result<()> { + let store_str = storage.to_string(); + + delete_fixtures(storage).await; + + let content_list = flatten_list_stream(storage, None).await?; + assert!( + content_list.is_empty(), + "Expected list to be empty; found: {:?}", + content_list + ); + + let location = Path::from("test_dir/test_file.json"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + storage.put(&location, data).await?; + + let root = Path::from("/"); + + // List everything + let content_list = flatten_list_stream(storage, None).await?; + assert_eq!(content_list, &[location.clone()]); + + // Should behave the same as no prefix + let content_list = flatten_list_stream(storage, Some(&root)).await?; + assert_eq!(content_list, &[location.clone()]); + + // List with delimiter + let result = storage.list_with_delimiter(None).await.unwrap(); + assert_eq!(&result.objects, &[]); + assert_eq!(result.common_prefixes.len(), 1); + assert_eq!(result.common_prefixes[0], Path::from("test_dir")); + + // Should behave the same as no prefix + let result = storage.list_with_delimiter(Some(&root)).await.unwrap(); + assert!(result.objects.is_empty()); + assert_eq!(result.common_prefixes.len(), 1); + assert_eq!(result.common_prefixes[0], Path::from("test_dir")); + + // List everything starting with a prefix that should return results + let prefix = Path::from("test_dir"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert_eq!(content_list, &[location.clone()]); + + // List everything starting with a prefix that shouldn't return results + let prefix = Path::from("something"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert!(content_list.is_empty()); + + let read_data = storage.get(&location).await?.bytes().await?; + assert_eq!(&*read_data, expected_data); + + // Test range request + let range = 3..7; + let range_result = storage.get_range(&location, range.clone()).await; + + let out_of_range = 200..300; + let out_of_range_result = storage.get_range(&location, out_of_range).await; + + if store_str.starts_with("MicrosoftAzureEmulator") { + // Azurite doesn't support x-ms-range-get-content-crc64 set by Azure SDK + // https://github.com/Azure/Azurite/issues/444 + let err = range_result.unwrap_err().to_string(); + assert!(err.contains("x-ms-range-get-content-crc64 header or parameter is not supported in Azurite strict mode"), "{}", err); + + let err = out_of_range_result.unwrap_err().to_string(); + assert!(err.contains("x-ms-range-get-content-crc64 header or parameter is not supported in Azurite strict mode"), "{}", err); + } else { + let bytes = range_result.unwrap(); + assert_eq!(bytes, expected_data.slice(range)); + + // Should be a non-fatal error + out_of_range_result.unwrap_err(); + } + + let head = storage.head(&location).await?; + assert_eq!(head.size, expected_data.len()); + + storage.delete(&location).await?; + + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + let err = storage.get(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + + let err = storage.head(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); + + // Test handling of paths containing an encoded delimiter + + let file_with_delimiter = Path::from_iter(["a", "b/c", "foo.file"]); + storage + .put(&file_with_delimiter, Bytes::from("arbitrary")) + .await + .unwrap(); + + let files = flatten_list_stream(storage, None).await.unwrap(); + assert_eq!(files, vec![file_with_delimiter.clone()]); + + let files = flatten_list_stream(storage, Some(&Path::from("a/b"))) + .await + .unwrap(); + assert!(files.is_empty()); + + let files = storage + .list_with_delimiter(Some(&Path::from("a/b"))) + .await + .unwrap(); + assert!(files.common_prefixes.is_empty()); + assert!(files.objects.is_empty()); + + let files = storage + .list_with_delimiter(Some(&Path::from("a"))) + .await + .unwrap(); + assert_eq!(files.common_prefixes, vec![Path::from_iter(["a", "b/c"])]); + assert!(files.objects.is_empty()); + + let files = storage + .list_with_delimiter(Some(&Path::from_iter(["a", "b/c"]))) + .await + .unwrap(); + assert!(files.common_prefixes.is_empty()); + assert_eq!(files.objects.len(), 1); + assert_eq!(files.objects[0].location, file_with_delimiter); + + storage.delete(&file_with_delimiter).await.unwrap(); + + // Test handling of paths containing non-ASCII characters, e.g. emoji + + let emoji_prefix = Path::from("🙀"); + let emoji_file = Path::from("🙀/😀.parquet"); + storage + .put(&emoji_file, Bytes::from("arbitrary")) + .await + .unwrap(); + + storage.head(&emoji_file).await.unwrap(); + storage + .get(&emoji_file) + .await + .unwrap() + .bytes() + .await + .unwrap(); + + let files = flatten_list_stream(storage, Some(&emoji_prefix)) + .await + .unwrap(); + + assert_eq!(files, vec![emoji_file.clone()]); + + storage.delete(&emoji_file).await.unwrap(); + let files = flatten_list_stream(storage, Some(&emoji_prefix)) + .await + .unwrap(); + assert!(files.is_empty()); + + Ok(()) + } + + pub(crate) async fn list_uses_directories_correctly(storage: &DynObjectStore) -> Result<()> { + delete_fixtures(storage).await; + + let content_list = flatten_list_stream(storage, None).await?; + assert!( + content_list.is_empty(), + "Expected list to be empty; found: {:?}", + content_list + ); + + let location1 = Path::from("foo/x.json"); + let location2 = Path::from("foo.bar/y.json"); + + let data = Bytes::from("arbitrary data"); + storage.put(&location1, data.clone()).await?; + storage.put(&location2, data).await?; + + let prefix = Path::from("foo"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert_eq!(content_list, &[location1.clone()]); + + let prefix = Path::from("foo/x"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert_eq!(content_list, &[]); + + Ok(()) + } + + pub(crate) async fn list_with_delimiter(storage: &DynObjectStore) -> Result<()> { + delete_fixtures(storage).await; + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + // ==================== do: create files ==================== + let data = Bytes::from("arbitrary data"); + + let files: Vec<_> = [ + "test_file", + "mydb/wb/000/000/000.segment", + "mydb/wb/000/000/001.segment", + "mydb/wb/000/000/002.segment", + "mydb/wb/001/001/000.segment", + "mydb/wb/foo.json", + "mydb/wbwbwb/111/222/333.segment", + "mydb/data/whatevs", + ] + .iter() + .map(|&s| Path::from(s)) + .collect(); + + for f in &files { + let data = data.clone(); + storage.put(f, data).await.unwrap(); + } + + // ==================== check: prefix-list 
`mydb/wb` (directory) ==================== + let prefix = Path::from("mydb/wb"); + + let expected_000 = Path::from("mydb/wb/000"); + let expected_001 = Path::from("mydb/wb/001"); + let expected_location = Path::from("mydb/wb/foo.json"); + + let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + + assert_eq!(result.common_prefixes, vec![expected_000, expected_001]); + assert_eq!(result.objects.len(), 1); + + let object = &result.objects[0]; + + assert_eq!(object.location, expected_location); + assert_eq!(object.size, data.len()); + + // ==================== check: prefix-list `mydb/wb/000/000/001` (partial filename doesn't match) ==================== + let prefix = Path::from("mydb/wb/000/000/001"); + + let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert_eq!(result.objects.len(), 0); + + // ==================== check: prefix-list `not_there` (non-existing prefix) ==================== + let prefix = Path::from("not_there"); + + let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert!(result.objects.is_empty()); + + // ==================== do: remove all files ==================== + for f in &files { + storage.delete(f).await.unwrap(); + } + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + Ok(()) + } + + pub(crate) async fn get_nonexistent_object( + storage: &DynObjectStore, + location: Option, + ) -> crate::Result { + let location = location.unwrap_or_else(|| Path::from("this_file_should_not_exist")); + + let err = storage.head(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. })); + + storage.get(&location).await?.bytes().await + } + + pub(crate) async fn rename_and_copy(storage: &DynObjectStore) -> Result<()> { + // Create two objects + let path1 = Path::from("test1"); + let path2 = Path::from("test2"); + let contents1 = Bytes::from("cats"); + let contents2 = Bytes::from("dogs"); + + // copy() make both objects identical + storage.put(&path1, contents1.clone()).await?; + storage.put(&path2, contents2.clone()).await?; + storage.copy(&path1, &path2).await?; + let new_contents = storage.get(&path2).await?.bytes().await?; + assert_eq!(&new_contents, &contents1); + + // rename() copies contents and deletes original + storage.put(&path1, contents1.clone()).await?; + storage.put(&path2, contents2.clone()).await?; + storage.rename(&path1, &path2).await?; + let new_contents = storage.get(&path2).await?.bytes().await?; + assert_eq!(&new_contents, &contents1); + let result = storage.get(&path1).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); + + // Clean up + storage.delete(&path2).await?; + + Ok(()) + } + + pub(crate) async fn copy_if_not_exists(storage: &DynObjectStore) -> Result<()> { + // Create two objects + let path1 = Path::from("test1"); + let path2 = Path::from("test2"); + let contents1 = Bytes::from("cats"); + let contents2 = Bytes::from("dogs"); + + // copy_if_not_exists() errors if destination already exists + storage.put(&path1, contents1.clone()).await?; + storage.put(&path2, contents2.clone()).await?; + let result = storage.copy_if_not_exists(&path1, &path2).await; + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + crate::Error::AlreadyExists { .. 
} + )); + + // copy_if_not_exists() copies contents and allows deleting original + storage.delete(&path2).await?; + storage.copy_if_not_exists(&path1, &path2).await?; + storage.delete(&path1).await?; + let new_contents = storage.get(&path2).await?.bytes().await?; + assert_eq!(&new_contents, &contents1); + let result = storage.get(&path1).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); + + // Clean up + storage.delete(&path2).await?; + + Ok(()) + } + + async fn delete_fixtures(storage: &DynObjectStore) { + let paths = flatten_list_stream(storage, None).await.unwrap(); + + for f in &paths { + let _ = storage.delete(f).await; + } + } + + /// Test that the returned stream does not borrow the lifetime of Path + async fn list_store<'a>( + store: &'a dyn ObjectStore, + path_str: &str, + ) -> super::Result>> { + let path = Path::from(path_str); + store.list(Some(&path)).await + } + + #[tokio::test] + async fn test_list_lifetimes() { + let store = memory::InMemory::new(); + let stream = list_store(&store, "path").await.unwrap(); + assert_eq!(stream.count().await, 0); + } + + // Tests TODO: + // GET nonexisting location (in_memory/file) + // DELETE nonexisting location + // PUT overwriting +} diff --git a/object_store_rs/src/local.rs b/object_store_rs/src/local.rs new file mode 100644 index 000000000000..b60dcb425d5b --- /dev/null +++ b/object_store_rs/src/local.rs @@ -0,0 +1,746 @@ +//! An object store implementation for a local filesystem +use crate::{ + maybe_spawn_blocking, + path::{filesystem_path_to_url, Path}, + GetResult, ListResult, ObjectMeta, ObjectStore, Result, +}; +use async_trait::async_trait; +use bytes::Bytes; +use futures::{stream::BoxStream, StreamExt}; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; +use std::collections::VecDeque; +use std::fs::File; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::ops::Range; +use std::sync::Arc; +use std::{collections::BTreeSet, convert::TryFrom, io}; +use url::Url; +use walkdir::{DirEntry, WalkDir}; + +/// A specialized `Error` for filesystem object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub(crate) enum Error { + #[snafu(display("File size for {} did not fit in a usize: {}", path, source))] + FileSizeOverflowedUsize { + source: std::num::TryFromIntError, + path: String, + }, + + #[snafu(display("Unable to walk dir: {}", source))] + UnableToWalkDir { + source: walkdir::Error, + }, + + #[snafu(display("Unable to access metadata for {}: {}", path, source))] + UnableToAccessMetadata { + source: Box, + path: String, + }, + + #[snafu(display("Unable to copy data to file: {}", source))] + UnableToCopyDataToFile { + source: io::Error, + }, + + #[snafu(display("Unable to create dir {}: {}", path.display(), source))] + UnableToCreateDir { + source: io::Error, + path: std::path::PathBuf, + }, + + #[snafu(display("Unable to create file {}: {}", path.display(), err))] + UnableToCreateFile { + path: std::path::PathBuf, + err: io::Error, + }, + + #[snafu(display("Unable to delete file {}: {}", path.display(), source))] + UnableToDeleteFile { + source: io::Error, + path: std::path::PathBuf, + }, + + #[snafu(display("Unable to open file {}: {}", path.display(), source))] + UnableToOpenFile { + source: io::Error, + path: std::path::PathBuf, + }, + + #[snafu(display("Unable to read data from file {}: {}", path.display(), source))] + UnableToReadBytes { + source: io::Error, + path: std::path::PathBuf, + }, + + #[snafu(display("Out of range of file {}, expected: 
{}, actual: {}", path.display(), expected, actual))] + OutOfRange { + path: std::path::PathBuf, + expected: usize, + actual: usize, + }, + + #[snafu(display("Unable to copy file from {} to {}: {}", from.display(), to.display(), source))] + UnableToCopyFile { + from: std::path::PathBuf, + to: std::path::PathBuf, + source: io::Error, + }, + + NotFound { + path: std::path::PathBuf, + source: io::Error, + }, + + #[snafu(display("Error seeking file {}: {}", path.display(), source))] + Seek { + source: io::Error, + path: std::path::PathBuf, + }, + + #[snafu(display("Unable to convert URL \"{}\" to filesystem path", url))] + InvalidUrl { + url: Url, + }, + + AlreadyExists { + path: String, + source: io::Error, + }, +} + +impl From for super::Error { + fn from(source: Error) -> Self { + match source { + Error::NotFound { path, source } => Self::NotFound { + path: path.to_string_lossy().to_string(), + source: source.into(), + }, + Error::AlreadyExists { path, source } => Self::AlreadyExists { + path, + source: source.into(), + }, + _ => Self::Generic { + store: "LocalFileSystem", + source: Box::new(source), + }, + } + } +} + +/// Local filesystem storage providing an [`ObjectStore`] interface to files on +/// local disk. Can optionally be created with a directory prefix +/// +/// # Path Semantics +/// +/// This implementation follows the [file URI] scheme outlined in [RFC 3986]. In +/// particular paths are delimited by `/` +/// +/// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme +/// [RFC 3986]: https://www.rfc-editor.org/rfc/rfc3986 +/// +/// # Tokio Compatibility +/// +/// Tokio discourages performing blocking IO on a tokio worker thread, however, +/// no major operating systems have stable async file APIs. Therefore if called from +/// a tokio context, this will use [`tokio::runtime::Handle::spawn_blocking`] to dispatch +/// IO to a blocking thread pool, much like `tokio::fs` does under-the-hood. +/// +/// If not called from a tokio context, this will perform IO on the current thread with +/// no additional complexity or overheads +#[derive(Debug)] +pub struct LocalFileSystem { + config: Arc, +} + +#[derive(Debug)] +struct Config { + root: Url, +} + +impl std::fmt::Display for LocalFileSystem { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "LocalFileSystem({})", self.config.root) + } +} + +impl Default for LocalFileSystem { + fn default() -> Self { + Self::new() + } +} + +impl LocalFileSystem { + /// Create new filesystem storage with no prefix + pub fn new() -> Self { + Self { + config: Arc::new(Config { + root: Url::parse("file:///").unwrap(), + }), + } + } + + /// Create new filesystem storage with `prefix` applied to all paths + pub fn new_with_prefix(prefix: impl AsRef) -> Result { + Ok(Self { + config: Arc::new(Config { + root: filesystem_path_to_url(prefix)?, + }), + }) + } +} + +impl Config { + /// Return filesystem path of the given location + fn path_to_filesystem(&self, location: &Path) -> Result { + let mut url = self.root.clone(); + url.path_segments_mut() + .expect("url path") + .extend(location.parts()); + + url.to_file_path() + .map_err(|_| Error::InvalidUrl { url }.into()) + } + + fn filesystem_to_path(&self, location: &std::path::Path) -> Result { + Ok(Path::from_filesystem_path_with_base( + location, + Some(&self.root), + )?) 
+ } +} + +#[async_trait] +impl ObjectStore for LocalFileSystem { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + let path = self.config.path_to_filesystem(location)?; + + maybe_spawn_blocking(move || { + let mut file = match File::create(&path) { + Ok(f) => f, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + let parent = path + .parent() + .context(UnableToCreateFileSnafu { path: &path, err })?; + std::fs::create_dir_all(&parent) + .context(UnableToCreateDirSnafu { path: parent })?; + + match File::create(&path) { + Ok(f) => f, + Err(err) => return Err(Error::UnableToCreateFile { path, err }.into()), + } + } + Err(err) => return Err(Error::UnableToCreateFile { path, err }.into()), + }; + + file.write_all(&bytes) + .context(UnableToCopyDataToFileSnafu)?; + + Ok(()) + }) + .await + } + + async fn get(&self, location: &Path) -> Result { + let path = self.config.path_to_filesystem(location)?; + maybe_spawn_blocking(move || { + let file = open_file(&path)?; + Ok(GetResult::File(file, path)) + }) + .await + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let path = self.config.path_to_filesystem(location)?; + maybe_spawn_blocking(move || { + let mut file = open_file(&path)?; + let to_read = range.end - range.start; + file.seek(SeekFrom::Start(range.start as u64)) + .context(SeekSnafu { path: &path })?; + + let mut buf = Vec::with_capacity(to_read); + let read = file + .take(to_read as u64) + .read_to_end(&mut buf) + .context(UnableToReadBytesSnafu { path: &path })?; + + ensure!( + read == to_read, + OutOfRangeSnafu { + path: &path, + expected: to_read, + actual: read + } + ); + + Ok(buf.into()) + }) + .await + } + + async fn head(&self, location: &Path) -> Result { + let path = self.config.path_to_filesystem(location)?; + let location = location.clone(); + + maybe_spawn_blocking(move || { + let file = open_file(&path)?; + let metadata = file.metadata().map_err(|e| Error::UnableToAccessMetadata { + source: e.into(), + path: location.to_string(), + })?; + + convert_metadata(metadata, location) + }) + .await + } + + async fn delete(&self, location: &Path) -> Result<()> { + let path = self.config.path_to_filesystem(location)?; + maybe_spawn_blocking(move || { + std::fs::remove_file(&path).context(UnableToDeleteFileSnafu { path })?; + Ok(()) + }) + .await + } + + async fn list(&self, prefix: Option<&Path>) -> Result>> { + let config = Arc::clone(&self.config); + + let root_path = match prefix { + Some(prefix) => config.path_to_filesystem(prefix)?, + None => self.config.root.to_file_path().unwrap(), + }; + + let walkdir = WalkDir::new(&root_path) + // Don't include the root directory itself + .min_depth(1); + + let s = + walkdir.into_iter().flat_map(move |result_dir_entry| { + match convert_walkdir_result(result_dir_entry) { + Err(e) => Some(Err(e)), + Ok(None) => None, + Ok(entry @ Some(_)) => entry + .filter(|dir_entry| dir_entry.file_type().is_file()) + .map(|entry| { + let location = config.filesystem_to_path(entry.path())?; + convert_entry(entry, location) + }), + } + }); + + // If no tokio context, return iterator directly as no + // need to perform chunked spawn_blocking reads + if tokio::runtime::Handle::try_current().is_err() { + return Ok(futures::stream::iter(s).boxed()); + } + + // Otherwise list in batches of CHUNK_SIZE + const CHUNK_SIZE: usize = 1024; + + let buffer = VecDeque::with_capacity(CHUNK_SIZE); + let stream = futures::stream::try_unfold((s, buffer), |(mut s, mut buffer)| async move { + if buffer.is_empty() { + 
(s, buffer) = tokio::task::spawn_blocking(move || { + for _ in 0..CHUNK_SIZE { + match s.next() { + Some(r) => buffer.push_back(r), + None => break, + } + } + (s, buffer) + }) + .await?; + } + + match buffer.pop_front() { + Some(Err(e)) => Err(e), + Some(Ok(meta)) => Ok(Some((meta, (s, buffer)))), + None => Ok(None), + } + }); + + Ok(stream.boxed()) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let config = Arc::clone(&self.config); + + let prefix = prefix.cloned().unwrap_or_default(); + let resolved_prefix = config.path_to_filesystem(&prefix)?; + + maybe_spawn_blocking(move || { + let walkdir = WalkDir::new(&resolved_prefix).min_depth(1).max_depth(1); + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + for entry_res in walkdir.into_iter().map(convert_walkdir_result) { + if let Some(entry) = entry_res? { + let is_directory = entry.file_type().is_dir(); + let entry_location = config.filesystem_to_path(entry.path())?; + + let mut parts = match entry_location.prefix_match(&prefix) { + Some(parts) => parts, + None => continue, + }; + + let common_prefix = match parts.next() { + Some(p) => p, + None => continue, + }; + + drop(parts); + + if is_directory { + common_prefixes.insert(prefix.child(common_prefix)); + } else { + objects.push(convert_entry(entry, entry_location)?); + } + } + } + + Ok(ListResult { + common_prefixes: common_prefixes.into_iter().collect(), + objects, + }) + }) + .await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + let from = self.config.path_to_filesystem(from)?; + let to = self.config.path_to_filesystem(to)?; + + maybe_spawn_blocking(move || { + std::fs::copy(&from, &to).context(UnableToCopyFileSnafu { from, to })?; + Ok(()) + }) + .await + } + + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + let from = self.config.path_to_filesystem(from)?; + let to = self.config.path_to_filesystem(to)?; + maybe_spawn_blocking(move || { + std::fs::rename(&from, &to).context(UnableToCopyFileSnafu { from, to })?; + Ok(()) + }) + .await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let from = self.config.path_to_filesystem(from)?; + let to = self.config.path_to_filesystem(to)?; + + maybe_spawn_blocking(move || { + std::fs::hard_link(&from, &to).map_err(|err| match err.kind() { + io::ErrorKind::AlreadyExists => Error::AlreadyExists { + path: to.to_str().unwrap().to_string(), + source: err, + } + .into(), + _ => Error::UnableToCopyFile { + from, + to, + source: err, + } + .into(), + }) + }) + .await + } +} + +fn open_file(path: &std::path::PathBuf) -> Result { + let file = File::open(path).map_err(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + Error::NotFound { + path: path.clone(), + source: e, + } + } else { + Error::UnableToOpenFile { + path: path.clone(), + source: e, + } + } + })?; + Ok(file) +} + +fn convert_entry(entry: DirEntry, location: Path) -> Result { + let metadata = entry + .metadata() + .map_err(|e| Error::UnableToAccessMetadata { + source: e.into(), + path: location.to_string(), + })?; + convert_metadata(metadata, location) +} + +fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result { + let last_modified = metadata + .modified() + .expect("Modified file time should be supported on this platform") + .into(); + + let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { + path: location.as_ref(), + })?; + + Ok(ObjectMeta { + location, + last_modified, + size, + }) +} + +/// 
Convert walkdir results and converts not-found errors into `None`. +fn convert_walkdir_result( + res: std::result::Result, +) -> Result> { + match res { + Ok(entry) => Ok(Some(entry)), + Err(walkdir_err) => match walkdir_err.io_error() { + Some(io_err) => match io_err.kind() { + io::ErrorKind::NotFound => Ok(None), + _ => Err(Error::UnableToWalkDir { + source: walkdir_err, + } + .into()), + }, + None => Err(Error::UnableToWalkDir { + source: walkdir_err, + } + .into()), + }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_util::flatten_list_stream; + use crate::{ + tests::{ + copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, + list_with_delimiter, put_get_delete_list, rename_and_copy, + }, + Error as ObjectStoreError, ObjectStore, + }; + use tempfile::TempDir; + + #[tokio::test] + async fn file_test() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + put_get_delete_list(&integration).await.unwrap(); + list_uses_directories_correctly(&integration).await.unwrap(); + list_with_delimiter(&integration).await.unwrap(); + rename_and_copy(&integration).await.unwrap(); + copy_if_not_exists(&integration).await.unwrap(); + } + + #[test] + fn test_non_tokio() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + futures::executor::block_on(async move { + put_get_delete_list(&integration).await.unwrap(); + list_uses_directories_correctly(&integration).await.unwrap(); + list_with_delimiter(&integration).await.unwrap(); + }); + } + + #[tokio::test] + async fn creates_dir_if_not_present() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("nested/file/test_file"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + integration.put(&location, data).await.unwrap(); + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + #[tokio::test] + async fn unknown_length() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("some_file"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + integration.put(&location, data).await.unwrap(); + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + #[tokio::test] + #[cfg(target_family = "unix")] + async fn bubble_up_io_errors() { + use std::{fs::set_permissions, os::unix::prelude::PermissionsExt}; + + let root = TempDir::new().unwrap(); + + // make non-readable + let metadata = root.path().metadata().unwrap(); + let mut permissions = metadata.permissions(); + permissions.set_mode(0o000); + set_permissions(root.path(), permissions).unwrap(); + + let store = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + // `list` must fail + match store.list(None).await { + Err(_) => { + // ok, error found + } + Ok(mut stream) => { + let mut any_err = false; + while let Some(res) = stream.next().await { + if res.is_err() { + any_err = true; + } + } + assert!(any_err); + } + } + + // `list_with_delimiter + assert!(store.list_with_delimiter(None).await.is_err()); + } + + const NON_EXISTENT_NAME: &str = "nonexistentname"; + + #[tokio::test] + async fn 
get_nonexistent_location() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from(NON_EXISTENT_NAME); + + let err = get_nonexistent_object(&integration, Some(location)) + .await + .unwrap_err(); + if let ObjectStoreError::NotFound { path, source } = err { + let source_variant = source.downcast_ref::(); + assert!( + matches!(source_variant, Some(std::io::Error { .. }),), + "got: {:?}", + source_variant + ); + assert!(path.ends_with(NON_EXISTENT_NAME), "{}", path); + } else { + panic!("unexpected error type: {:?}", err); + } + } + + #[tokio::test] + async fn root() { + let integration = LocalFileSystem::new(); + + let canonical = std::path::Path::new("Cargo.toml").canonicalize().unwrap(); + let url = Url::from_directory_path(&canonical).unwrap(); + let path = Path::parse(url.path()).unwrap(); + + let roundtrip = integration.config.path_to_filesystem(&path).unwrap(); + + // Needed as on Windows canonicalize returns extended length path syntax + // C:\Users\circleci -> \\?\C:\Users\circleci + let roundtrip = roundtrip.canonicalize().unwrap(); + + assert_eq!(roundtrip, canonical); + + integration.head(&path).await.unwrap(); + } + + #[tokio::test] + async fn test_list_root() { + let integration = LocalFileSystem::new(); + let result = integration.list_with_delimiter(None).await; + if cfg!(target_family = "windows") { + let r = result.unwrap_err().to_string(); + assert!( + r.contains("Unable to convert URL \"file:///\" to filesystem path"), + "{}", + r + ); + } else { + result.unwrap(); + } + } + + #[tokio::test] + async fn invalid_path() { + let root = TempDir::new().unwrap(); + let root = root.path().join("🙀"); + std::fs::create_dir(root.clone()).unwrap(); + + // Invalid paths supported above root of store + let integration = LocalFileSystem::new_with_prefix(root.clone()).unwrap(); + + let directory = Path::from("directory"); + let object = directory.child("child.txt"); + let data = Bytes::from("arbitrary"); + integration.put(&object, data.clone()).await.unwrap(); + integration.head(&object).await.unwrap(); + let result = integration.get(&object).await.unwrap(); + assert_eq!(result.bytes().await.unwrap(), data); + + flatten_list_stream(&integration, None).await.unwrap(); + flatten_list_stream(&integration, Some(&directory)) + .await + .unwrap(); + + let result = integration + .list_with_delimiter(Some(&directory)) + .await + .unwrap(); + assert_eq!(result.objects.len(), 1); + assert!(result.common_prefixes.is_empty()); + assert_eq!(result.objects[0].location, object); + + let illegal = root.join("💀"); + std::fs::write(illegal, "foo").unwrap(); + + // Can list directory that doesn't contain illegal path + flatten_list_stream(&integration, Some(&directory)) + .await + .unwrap(); + + // Cannot list illegal file + let err = flatten_list_stream(&integration, None) + .await + .unwrap_err() + .to_string(); + + assert!( + err.contains("Invalid path segment - got \"💀\" expected: \"%F0%9F%92%80\""), + "{}", + err + ); + } +} diff --git a/object_store_rs/src/memory.rs b/object_store_rs/src/memory.rs new file mode 100644 index 000000000000..dcee1b330876 --- /dev/null +++ b/object_store_rs/src/memory.rs @@ -0,0 +1,277 @@ +//! 
An in-memory object store implementation +use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use async_trait::async_trait; +use bytes::Bytes; +use chrono::Utc; +use futures::{stream::BoxStream, StreamExt}; +use parking_lot::RwLock; +use snafu::{ensure, OptionExt, Snafu}; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::ops::Range; + +/// A specialized `Error` for in-memory object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("No data in memory found. Location: {path}"))] + NoDataInMemory { path: String }, + + #[snafu(display("Out of range"))] + OutOfRange, + + #[snafu(display("Bad range"))] + BadRange, + + #[snafu(display("Object already exists at that location: {path}"))] + AlreadyExists { path: String }, +} + +impl From for super::Error { + fn from(source: Error) -> Self { + match source { + Error::NoDataInMemory { ref path } => Self::NotFound { + path: path.into(), + source: source.into(), + }, + Error::AlreadyExists { ref path } => Self::AlreadyExists { + path: path.into(), + source: source.into(), + }, + _ => Self::Generic { + store: "InMemory", + source: Box::new(source), + }, + } + } +} + +/// In-memory storage suitable for testing or for opting out of using a cloud +/// storage provider. +#[derive(Debug, Default)] +pub struct InMemory { + storage: RwLock>, +} + +impl std::fmt::Display for InMemory { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "InMemory") + } +} + +#[async_trait] +impl ObjectStore for InMemory { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.storage.write().insert(location.clone(), bytes); + Ok(()) + } + + async fn get(&self, location: &Path) -> Result { + let data = self.get_bytes(location).await?; + + Ok(GetResult::Stream( + futures::stream::once(async move { Ok(data) }).boxed(), + )) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let data = self.get_bytes(location).await?; + ensure!(range.end <= data.len(), OutOfRangeSnafu); + ensure!(range.start <= range.end, BadRangeSnafu); + + Ok(data.slice(range)) + } + + async fn head(&self, location: &Path) -> Result { + let last_modified = Utc::now(); + let bytes = self.get_bytes(location).await?; + Ok(ObjectMeta { + location: location.clone(), + last_modified, + size: bytes.len(), + }) + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.storage.write().remove(location); + Ok(()) + } + + async fn list(&self, prefix: Option<&Path>) -> Result>> { + let last_modified = Utc::now(); + + let storage = self.storage.read(); + let values: Vec<_> = storage + .iter() + .filter(move |(key, _)| prefix.map(|p| key.prefix_matches(p)).unwrap_or(true)) + .map(move |(key, value)| { + Ok(ObjectMeta { + location: key.clone(), + last_modified, + size: value.len(), + }) + }) + .collect(); + + Ok(futures::stream::iter(values).boxed()) + } + + /// The memory implementation returns all results, as opposed to the cloud + /// versions which limit their results to 1k or more because of API + /// limitations. + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let root = Path::default(); + let prefix = prefix.unwrap_or(&root); + + let mut common_prefixes = BTreeSet::new(); + let last_modified = Utc::now(); + + // Only objects in this base level should be returned in the + // response. Otherwise, we just collect the common prefixes. 
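+        // For example (illustrative): with prefix `foo` and stored keys
+        // `foo/bar.txt` and `foo/baz/qux.txt`, the former is returned as an
+        // object while `foo/baz` is collected as a common prefix.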
+ let mut objects = vec![]; + for (k, v) in self.storage.read().range((prefix)..) { + let mut parts = match k.prefix_match(prefix) { + Some(parts) => parts, + None => break, + }; + + // Pop first element + let common_prefix = match parts.next() { + Some(p) => p, + None => continue, + }; + + if parts.next().is_some() { + common_prefixes.insert(prefix.child(common_prefix)); + } else { + let object = ObjectMeta { + location: k.clone(), + last_modified, + size: v.len(), + }; + objects.push(object); + } + } + + Ok(ListResult { + objects, + common_prefixes: common_prefixes.into_iter().collect(), + }) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + let data = self.get_bytes(from).await?; + self.storage.write().insert(to.clone(), data); + Ok(()) + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let data = self.get_bytes(from).await?; + let mut storage = self.storage.write(); + if storage.contains_key(to) { + return Err(Error::AlreadyExists { + path: to.to_string(), + } + .into()); + } + storage.insert(to.clone(), data); + Ok(()) + } +} + +impl InMemory { + /// Create new in-memory storage. + pub fn new() -> Self { + Self::default() + } + + /// Creates a clone of the store + pub async fn clone(&self) -> Self { + let storage = self.storage.read(); + let storage = storage.clone(); + + Self { + storage: RwLock::new(storage), + } + } + + async fn get_bytes(&self, location: &Path) -> Result { + let storage = self.storage.read(); + let bytes = storage + .get(location) + .cloned() + .context(NoDataInMemorySnafu { + path: location.to_string(), + })?; + Ok(bytes) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::{ + tests::{ + copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, + list_with_delimiter, put_get_delete_list, rename_and_copy, + }, + Error as ObjectStoreError, ObjectStore, + }; + + #[tokio::test] + async fn in_memory_test() { + let integration = InMemory::new(); + + put_get_delete_list(&integration).await.unwrap(); + list_uses_directories_correctly(&integration).await.unwrap(); + list_with_delimiter(&integration).await.unwrap(); + rename_and_copy(&integration).await.unwrap(); + copy_if_not_exists(&integration).await.unwrap(); + } + + #[tokio::test] + async fn unknown_length() { + let integration = InMemory::new(); + + let location = Path::from("some_file"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + integration.put(&location, data).await.unwrap(); + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + const NON_EXISTENT_NAME: &str = "nonexistentname"; + + #[tokio::test] + async fn nonexistent_location() { + let integration = InMemory::new(); + + let location = Path::from(NON_EXISTENT_NAME); + + let err = get_nonexistent_object(&integration, Some(location)) + .await + .unwrap_err(); + if let ObjectStoreError::NotFound { path, source } = err { + let source_variant = source.downcast_ref::(); + assert!( + matches!(source_variant, Some(Error::NoDataInMemory { .. 
}),), + "got: {:?}", + source_variant + ); + assert_eq!(path, NON_EXISTENT_NAME); + } else { + panic!("unexpected error type: {:?}", err); + } + } +} diff --git a/object_store_rs/src/oauth.rs b/object_store_rs/src/oauth.rs new file mode 100644 index 000000000000..173986ea617e --- /dev/null +++ b/object_store_rs/src/oauth.rs @@ -0,0 +1,198 @@ +use crate::token::TemporaryToken; +use reqwest::{Client, Method}; +use ring::signature::RsaKeyPair; +use snafu::{ResultExt, Snafu}; +use std::time::{Duration, Instant}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("No RSA key found in pem file"))] + MissingKey, + + #[snafu(display("Invalid RSA key: {}", source), context(false))] + InvalidKey { source: ring::error::KeyRejected }, + + #[snafu(display("Error signing jwt: {}", source))] + Sign { source: ring::error::Unspecified }, + + #[snafu(display("Error encoding jwt payload: {}", source))] + Encode { source: serde_json::Error }, + + #[snafu(display("Unsupported key encoding: {}", encoding))] + UnsupportedKey { encoding: String }, + + #[snafu(display("Error performing token request: {}", source))] + TokenRequest { source: reqwest::Error }, +} + +pub type Result = std::result::Result; + +#[derive(Debug, Default, serde::Serialize)] +pub struct JwtHeader { + /// The type of JWS: it can only be "JWT" here + /// + /// Defined in [RFC7515#4.1.9](https://tools.ietf.org/html/rfc7515#section-4.1.9). + #[serde(skip_serializing_if = "Option::is_none")] + pub typ: Option, + /// The algorithm used + /// + /// Defined in [RFC7515#4.1.1](https://tools.ietf.org/html/rfc7515#section-4.1.1). + pub alg: String, + /// Content type + /// + /// Defined in [RFC7519#5.2](https://tools.ietf.org/html/rfc7519#section-5.2). + #[serde(skip_serializing_if = "Option::is_none")] + pub cty: Option, + /// JSON Key URL + /// + /// Defined in [RFC7515#4.1.2](https://tools.ietf.org/html/rfc7515#section-4.1.2). + #[serde(skip_serializing_if = "Option::is_none")] + pub jku: Option, + /// Key ID + /// + /// Defined in [RFC7515#4.1.4](https://tools.ietf.org/html/rfc7515#section-4.1.4). + #[serde(skip_serializing_if = "Option::is_none")] + pub kid: Option, + /// X.509 URL + /// + /// Defined in [RFC7515#4.1.5](https://tools.ietf.org/html/rfc7515#section-4.1.5). + #[serde(skip_serializing_if = "Option::is_none")] + pub x5u: Option, + /// X.509 certificate thumbprint + /// + /// Defined in [RFC7515#4.1.7](https://tools.ietf.org/html/rfc7515#section-4.1.7). 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub x5t: Option, +} + +#[derive(serde::Serialize)] +struct TokenClaims<'a> { + iss: &'a str, + scope: &'a str, + aud: &'a str, + exp: u64, + iat: u64, +} + +#[derive(serde::Deserialize, Debug)] +struct TokenResponse { + access_token: String, + expires_in: u64, +} + +/// Encapsulates the logic to perform an OAuth token challenge +#[derive(Debug)] +pub struct OAuthProvider { + issuer: String, + scope: String, + audience: String, + key_pair: RsaKeyPair, + jwt_header: String, + random: ring::rand::SystemRandom, +} + +impl OAuthProvider { + /// Create a new [`OAuthProvider`] + pub fn new( + issuer: String, + private_key_pem: String, + scope: String, + audience: String, + ) -> Result { + let key_pair = decode_first_rsa_key(private_key_pem)?; + let jwt_header = b64_encode_obj(&JwtHeader { + alg: "RS256".to_string(), + ..Default::default() + })?; + + Ok(Self { + issuer, + key_pair, + scope, + audience, + jwt_header, + random: ring::rand::SystemRandom::new(), + }) + } + + /// Fetch a fresh token + pub async fn fetch_token(&self, client: &Client) -> Result> { + let now = seconds_since_epoch(); + let exp = now + 3600; + + let claims = TokenClaims { + iss: &self.issuer, + scope: &self.scope, + aud: &self.audience, + exp, + iat: now, + }; + + let claim_str = b64_encode_obj(&claims)?; + let message = [self.jwt_header.as_ref(), claim_str.as_ref()].join("."); + let mut sig_bytes = vec![0; self.key_pair.public_modulus_len()]; + self.key_pair + .sign( + &ring::signature::RSA_PKCS1_SHA256, + &self.random, + message.as_bytes(), + &mut sig_bytes, + ) + .context(SignSnafu)?; + + let signature = base64::encode_config(&sig_bytes, base64::URL_SAFE_NO_PAD); + let jwt = [message, signature].join("."); + + let body = [ + ("grant_type", "urn:ietf:params:oauth:grant-type:jwt-bearer"), + ("assertion", &jwt), + ]; + + let response: TokenResponse = client + .request(Method::POST, &self.audience) + .form(&body) + .send() + .await + .context(TokenRequestSnafu)? + .error_for_status() + .context(TokenRequestSnafu)? + .json() + .await + .context(TokenRequestSnafu)?; + + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + + Ok(token) + } +} + +/// Returns the number of seconds since unix epoch +fn seconds_since_epoch() -> u64 { + std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() +} + +fn decode_first_rsa_key(private_key_pem: String) -> Result { + use rustls_pemfile::Item; + use std::io::{BufReader, Cursor}; + + let mut cursor = Cursor::new(private_key_pem); + let mut reader = BufReader::new(&mut cursor); + + // Reading from string is infallible + match rustls_pemfile::read_one(&mut reader).unwrap() { + Some(Item::PKCS8Key(key)) => Ok(RsaKeyPair::from_pkcs8(&key)?), + Some(Item::RSAKey(key)) => Ok(RsaKeyPair::from_der(&key)?), + _ => Err(Error::MissingKey), + } +} + +fn b64_encode_obj(obj: &T) -> Result { + let string = serde_json::to_string(obj).context(EncodeSnafu)?; + Ok(base64::encode_config(string, base64::URL_SAFE_NO_PAD)) +} diff --git a/object_store_rs/src/path/mod.rs b/object_store_rs/src/path/mod.rs new file mode 100644 index 000000000000..5e045aae7c4f --- /dev/null +++ b/object_store_rs/src/path/mod.rs @@ -0,0 +1,504 @@ +//! 
Path abstraction for Object Storage + +use itertools::Itertools; +use percent_encoding::percent_decode; +use snafu::{ensure, ResultExt, Snafu}; +use std::fmt::Formatter; +use url::Url; + +/// The delimiter to separate object namespaces, creating a directory structure. +pub const DELIMITER: &str = "/"; + +/// The path delimiter as a single byte +pub const DELIMITER_BYTE: u8 = DELIMITER.as_bytes()[0]; + +mod parts; + +pub use parts::{InvalidPart, PathPart}; + +/// Error returned by [`Path::parse`] +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Path \"{}\" contained empty path segment", path))] + EmptySegment { path: String }, + + #[snafu(display("Error parsing Path \"{}\": {}", path, source))] + BadSegment { path: String, source: InvalidPart }, + + #[snafu(display("Failed to canonicalize path \"{}\": {}", path.display(), source))] + Canonicalize { + path: std::path::PathBuf, + source: std::io::Error, + }, + + #[snafu(display("Unable to convert path \"{}\" to URL", path.display()))] + InvalidPath { path: std::path::PathBuf }, + + #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] + NonUnicode { + path: String, + source: std::str::Utf8Error, + }, + + #[snafu(display("Path {} does not start with prefix {}", path, prefix))] + PrefixMismatch { path: String, prefix: String }, +} + +/// A parsed path representation that can be safely written to object storage +/// +/// # Path Safety +/// +/// In theory object stores support any UTF-8 character sequence, however, certain character +/// sequences cause compatibility problems with some applications and protocols. As such the +/// naming guidelines for [S3], [GCS] and [Azure Blob Storage] all recommend sticking to a +/// limited character subset. +/// +/// [S3]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html +/// [GCS]: https://cloud.google.com/storage/docs/naming-objects +/// [Azure Blob Storage]: https://docs.microsoft.com/en-us/rest/api/storageservices/Naming-and-Referencing-Containers--Blobs--and-Metadata#blob-names +/// +/// This presents libraries with two options for consistent path handling: +/// +/// 1. Allow constructing unsafe paths, allowing for both reading and writing of data to paths +/// that may not be consistently understood or supported +/// 2. Disallow constructing unsafe paths, ensuring data written can be consistently handled by +/// all other systems, but preventing interaction with objects at unsafe paths +/// +/// This library takes the second approach, in particular: +/// +/// * Paths are delimited by `/` +/// * Paths do not start with a `/` +/// * Empty path segments are discarded (e.g. `//` is treated as though it were `/`) +/// * Relative path segments, i.e. `.` and `..` are percent encoded +/// * Unsafe characters are percent encoded, as described by [RFC 1738] +/// * All paths are relative to the root of the object store +/// +/// In order to provide these guarantees there are two ways to safely construct a [`Path`] +/// +/// # Encode +/// +/// A string containing potentially illegal path segments can be encoded to a [`Path`] +/// using [`Path::from`] or [`Path::from_iter`]. 
+/// +/// ``` +/// # use object_store::path::Path; +/// assert_eq!(Path::from("foo/bar").as_ref(), "foo/bar"); +/// assert_eq!(Path::from("foo//bar").as_ref(), "foo/bar"); +/// assert_eq!(Path::from("foo/../bar").as_ref(), "foo/%2E%2E/bar"); +/// assert_eq!(Path::from_iter(["foo", "foo/bar"]).as_ref(), "foo/foo%2Fbar"); +/// ``` +/// +/// Note: if provided with an already percent encoded string, this will encode it again +/// +/// ``` +/// # use object_store::path::Path; +/// assert_eq!(Path::from("foo/foo%2Fbar").as_ref(), "foo/foo%252Fbar"); +/// ``` +/// +/// # Parse +/// +/// Alternatively a [`Path`] can be created from an existing string, returning an +/// error if it is invalid. Unlike the encoding methods, this will permit +/// valid percent encoded sequences. +/// +/// ``` +/// # use object_store::path::Path; +/// +/// assert_eq!(Path::parse("/foo/foo%2Fbar").unwrap().as_ref(), "foo/foo%2Fbar"); +/// Path::parse("..").unwrap_err(); +/// Path::parse("/foo//").unwrap_err(); +/// Path::parse("😀").unwrap_err(); +/// Path::parse("%Q").unwrap_err(); +/// ``` +/// +/// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt +#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Ord, PartialOrd)] +pub struct Path { + /// The raw path with no leading or trailing delimiters + raw: String, +} + +impl Path { + /// Parse a string as a [`Path`], returning a [`Error`] if invalid, + /// as defined on the docstring for [`Path`] + /// + /// Note: this will strip any leading `/` or trailing `/` + pub fn parse(path: impl AsRef) -> Result { + let path = path.as_ref(); + + let stripped = path.strip_prefix(DELIMITER).unwrap_or(path); + if stripped.is_empty() { + return Ok(Default::default()); + } + + let stripped = stripped.strip_suffix(DELIMITER).unwrap_or(stripped); + + for segment in stripped.split(DELIMITER) { + ensure!(!segment.is_empty(), EmptySegmentSnafu { path }); + PathPart::parse(segment).context(BadSegmentSnafu { path })?; + } + + Ok(Self { + raw: stripped.to_string(), + }) + } + + /// Convert a filesystem path to a [`Path`] relative to the filesystem root + /// + /// This will return an error if the path does not exist, or contains illegal + /// character sequences as defined by [`Path::parse`] + pub fn from_filesystem_path(path: impl AsRef) -> Result { + Self::from_filesystem_path_with_base(path, None) + } + + /// Convert a filesystem path to a [`Path`] relative to the provided base + /// + /// This will return an error if the path does not exist on the local filesystem, + /// contains illegal character sequences as defined by [`Path::parse`], or `base` + /// does not refer to a parent path of `path` + pub(crate) fn from_filesystem_path_with_base( + path: impl AsRef, + base: Option<&Url>, + ) -> Result { + let url = filesystem_path_to_url(path)?; + let path = match base { + Some(prefix) => { + url.path() + .strip_prefix(prefix.path()) + .ok_or_else(|| Error::PrefixMismatch { + path: url.path().to_string(), + prefix: prefix.to_string(), + })? 
+ } + None => url.path(), + }; + + // Reverse any percent encoding performed by conversion to URL + let decoded = percent_decode(path.as_bytes()) + .decode_utf8() + .context(NonUnicodeSnafu { path })?; + + Self::parse(decoded) + } + + /// Returns the [`PathPart`] of this [`Path`] + pub fn parts(&self) -> impl Iterator> { + match self.raw.is_empty() { + true => itertools::Either::Left(std::iter::empty()), + false => itertools::Either::Right( + self.raw + .split(DELIMITER) + .map(|s| PathPart { raw: s.into() }), + ), + } + } + + /// Returns an iterator of the [`PathPart`] of this [`Path`] after `prefix` + /// + /// Returns `None` if the prefix does not match + pub fn prefix_match(&self, prefix: &Self) -> Option> + '_> { + let diff = itertools::diff_with(self.parts(), prefix.parts(), |a, b| a == b); + + match diff { + // Both were equal + None => Some(itertools::Either::Left(std::iter::empty())), + // Mismatch or prefix was longer => None + Some(itertools::Diff::FirstMismatch(_, _, _) | itertools::Diff::Longer(_, _)) => None, + // Match with remaining + Some(itertools::Diff::Shorter(_, back)) => Some(itertools::Either::Right(back)), + } + } + + /// Returns true if this [`Path`] starts with `prefix` + pub fn prefix_matches(&self, prefix: &Self) -> bool { + self.prefix_match(prefix).is_some() + } + + /// Creates a new child of this [`Path`] + pub fn child<'a>(&self, child: impl Into>) -> Self { + let raw = match self.raw.is_empty() { + true => format!("{}", child.into().raw), + false => format!("{}{}{}", self.raw, DELIMITER, child.into().raw), + }; + + Self { raw } + } +} + +impl AsRef for Path { + fn as_ref(&self) -> &str { + &self.raw + } +} + +impl From<&str> for Path { + fn from(path: &str) -> Self { + Self::from_iter(path.split(DELIMITER)) + } +} + +impl From for Path { + fn from(path: String) -> Self { + Self::from_iter(path.split(DELIMITER)) + } +} + +impl From for String { + fn from(path: Path) -> Self { + path.raw + } +} + +impl std::fmt::Display for Path { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.raw.fmt(f) + } +} + +impl<'a, I> FromIterator for Path +where + I: Into>, +{ + fn from_iter>(iter: T) -> Self { + let raw = T::into_iter(iter) + .map(|s| s.into()) + .filter(|s| !s.raw.is_empty()) + .map(|s| s.raw) + .join(DELIMITER); + + Self { raw } + } +} + +/// Given a filesystem path, convert it to its canonical URL representation, +/// returning an error if the file doesn't exist on the local filesystem +pub(crate) fn filesystem_path_to_url(path: impl AsRef) -> Result { + let path = path.as_ref().canonicalize().context(CanonicalizeSnafu { + path: path.as_ref(), + })?; + + match path.is_dir() { + true => Url::from_directory_path(&path), + false => Url::from_file_path(&path), + } + .map_err(|_| Error::InvalidPath { path }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cloud_prefix_with_trailing_delimiter() { + // Use case: files exist in object storage named `foo/bar.json` and + // `foo_test.json`. A search for the prefix `foo/` should return + // `foo/bar.json` but not `foo_test.json'. 
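+        // Note (illustrative): `Path::from_iter` never appends a trailing
+        // delimiter, so callers that need a `foo/`-style listing prefix (such
+        // as `util::format_prefix`) have to add the delimiter themselves.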
+ let prefix = Path::from_iter(["test"]); + assert_eq!(prefix.as_ref(), "test"); + } + + #[test] + fn push_encodes() { + let location = Path::from_iter(["foo/bar", "baz%2Ftest"]); + assert_eq!(location.as_ref(), "foo%2Fbar/baz%252Ftest"); + } + + #[test] + fn test_parse() { + assert_eq!(Path::parse("/").unwrap().as_ref(), ""); + assert_eq!(Path::parse("").unwrap().as_ref(), ""); + + let err = Path::parse("//").unwrap_err(); + assert!(matches!(err, Error::EmptySegment { .. })); + + assert_eq!(Path::parse("/foo/bar/").unwrap().as_ref(), "foo/bar"); + assert_eq!(Path::parse("foo/bar/").unwrap().as_ref(), "foo/bar"); + assert_eq!(Path::parse("foo/bar").unwrap().as_ref(), "foo/bar"); + + let err = Path::parse("foo///bar").unwrap_err(); + assert!(matches!(err, Error::EmptySegment { .. })); + } + + #[test] + fn convert_raw_before_partial_eq() { + // dir and file_name + let cloud = Path::from("test_dir/test_file.json"); + let built = Path::from_iter(["test_dir", "test_file.json"]); + + assert_eq!(built, cloud); + + // dir and file_name w/o dot + let cloud = Path::from("test_dir/test_file"); + let built = Path::from_iter(["test_dir", "test_file"]); + + assert_eq!(built, cloud); + + // dir, no file + let cloud = Path::from("test_dir/"); + let built = Path::from_iter(["test_dir"]); + assert_eq!(built, cloud); + + // file_name, no dir + let cloud = Path::from("test_file.json"); + let built = Path::from_iter(["test_file.json"]); + assert_eq!(built, cloud); + + // empty + let cloud = Path::from(""); + let built = Path::from_iter(["", ""]); + + assert_eq!(built, cloud); + } + + #[test] + fn parts_after_prefix_behavior() { + let existing_path = Path::from("apple/bear/cow/dog/egg.json"); + + // Prefix with one directory + let prefix = Path::from("apple"); + let expected_parts: Vec> = vec!["bear", "cow", "dog", "egg.json"] + .into_iter() + .map(Into::into) + .collect(); + let parts: Vec<_> = existing_path.prefix_match(&prefix).unwrap().collect(); + assert_eq!(parts, expected_parts); + + // Prefix with two directories + let prefix = Path::from("apple/bear"); + let expected_parts: Vec> = vec!["cow", "dog", "egg.json"] + .into_iter() + .map(Into::into) + .collect(); + let parts: Vec<_> = existing_path.prefix_match(&prefix).unwrap().collect(); + assert_eq!(parts, expected_parts); + + // Not a prefix + let prefix = Path::from("cow"); + assert!(existing_path.prefix_match(&prefix).is_none()); + + // Prefix with a partial directory + let prefix = Path::from("ap"); + assert!(existing_path.prefix_match(&prefix).is_none()); + + // Prefix matches but there aren't any parts after it + let existing_path = Path::from("apple/bear/cow/dog"); + + let prefix = existing_path.clone(); + assert_eq!(existing_path.prefix_match(&prefix).unwrap().count(), 0); + } + + #[test] + fn prefix_matches() { + let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something"]); + let needle = haystack.clone(); + // self starts with self + assert!( + haystack.prefix_matches(&haystack), + "{:?} should have started with {:?}", + haystack, + haystack + ); + + // a longer prefix doesn't match + let needle = needle.child("longer now"); + assert!( + !haystack.prefix_matches(&needle), + "{:?} shouldn't have started with {:?}", + haystack, + needle + ); + + // one dir prefix matches + let needle = Path::from_iter(["foo/bar"]); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // two dir prefix matches + let needle = needle.child("baz%2Ftest"); + assert!( + 
haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // partial dir prefix doesn't match + let needle = Path::from_iter(["f"]); + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // one dir and one partial dir doesn't match + let needle = Path::from_iter(["foo/bar", "baz"]); + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // empty prefix matches + let needle = Path::from(""); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + } + + #[test] + fn prefix_matches_with_file_name() { + let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo.segment"]); + + // All directories match and file name is a prefix + let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo"]); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // All directories match but file name is not a prefix + let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "e"]); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // Not all directories match; file name is a prefix of the next directory; this + // does not match + let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "s"]); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // Not all directories match; file name is NOT a prefix of the next directory; + // no match + let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "p"]); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + } +} diff --git a/object_store_rs/src/path/parts.rs b/object_store_rs/src/path/parts.rs new file mode 100644 index 000000000000..1ecc5dcdd286 --- /dev/null +++ b/object_store_rs/src/path/parts.rs @@ -0,0 +1,131 @@ +use percent_encoding::{percent_decode, percent_encode, AsciiSet, CONTROLS}; +use std::borrow::Cow; + +use crate::path::DELIMITER_BYTE; +use snafu::Snafu; + +/// Error returned by [`PathPart::parse`] +#[derive(Debug, Snafu)] +#[snafu(display("Invalid path segment - got \"{}\" expected: \"{}\"", actual, expected))] +#[allow(missing_copy_implementations)] +pub struct InvalidPart { + actual: String, + expected: String, +} + +/// The PathPart type exists to validate the directory/file names that form part +/// of a path. +/// +/// A PathPart instance is guaranteed to to contain no illegal characters (e.g. `/`) +/// as it can only be constructed by going through the `from` impl. +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)] +pub struct PathPart<'a> { + pub(super) raw: Cow<'a, str>, +} + +impl<'a> PathPart<'a> { + /// Parse the provided path segment as a [`PathPart`] returning an error if invalid + pub fn parse(segment: &'a str) -> Result { + let decoded: Cow<'a, [u8]> = percent_decode(segment.as_bytes()).into(); + let part = PathPart::from(decoded.as_ref()); + if segment != part.as_ref() { + return Err(InvalidPart { + actual: segment.to_string(), + expected: part.raw.to_string(), + }); + } + + Ok(Self { + raw: segment.into(), + }) + } +} + +/// Characters we want to encode. 
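+///
+/// For example (illustrative), `PathPart::from("foo#bar")` is stored as
+/// `"foo%23bar"`, while characters outside this set pass through unchanged.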
+const INVALID: &AsciiSet = &CONTROLS + // The delimiter we are reserving for internal hierarchy + .add(DELIMITER_BYTE) + // Characters AWS recommends avoiding for object keys + // https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html + .add(b'\\') + .add(b'{') + .add(b'^') + .add(b'}') + .add(b'%') + .add(b'`') + .add(b']') + .add(b'"') // " <-- my editor is confused about double quotes within single quotes + .add(b'>') + .add(b'[') + .add(b'~') + .add(b'<') + .add(b'#') + .add(b'|') + // Characters Google Cloud Storage recommends avoiding for object names + // https://cloud.google.com/storage/docs/naming-objects + .add(b'\r') + .add(b'\n') + .add(b'*') + .add(b'?'); + +impl<'a> From<&'a [u8]> for PathPart<'a> { + fn from(v: &'a [u8]) -> Self { + let inner = match v { + // We don't want to encode `.` generally, but we do want to disallow parts of paths + // to be equal to `.` or `..` to prevent file system traversal shenanigans. + b"." => "%2E".into(), + b".." => "%2E%2E".into(), + other => percent_encode(other, INVALID).into(), + }; + Self { raw: inner } + } +} + +impl<'a> From<&'a str> for PathPart<'a> { + fn from(v: &'a str) -> Self { + Self::from(v.as_bytes()) + } +} + +impl From for PathPart<'static> { + fn from(s: String) -> Self { + Self { + raw: Cow::Owned(PathPart::from(s.as_str()).raw.into_owned()), + } + } +} + +impl<'a> AsRef for PathPart<'a> { + fn as_ref(&self) -> &str { + self.raw.as_ref() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn path_part_delimiter_gets_encoded() { + let part: PathPart<'_> = "foo/bar".into(); + assert_eq!(part.raw, "foo%2Fbar"); + } + + #[test] + fn path_part_given_already_encoded_string() { + let part: PathPart<'_> = "foo%2Fbar".into(); + assert_eq!(part.raw, "foo%252Fbar"); + } + + #[test] + fn path_part_cant_be_one_dot() { + let part: PathPart<'_> = ".".into(); + assert_eq!(part.raw, "%2E"); + } + + #[test] + fn path_part_cant_be_two_dots() { + let part: PathPart<'_> = "..".into(); + assert_eq!(part.raw, "%2E%2E"); + } +} diff --git a/object_store_rs/src/throttle.rs b/object_store_rs/src/throttle.rs new file mode 100644 index 000000000000..2b4087480ef3 --- /dev/null +++ b/object_store_rs/src/throttle.rs @@ -0,0 +1,498 @@ +//! A throttling object store wrapper +use parking_lot::Mutex; +use std::ops::Range; +use std::{convert::TryInto, sync::Arc}; + +use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use async_trait::async_trait; +use bytes::Bytes; +use futures::{stream::BoxStream, StreamExt}; +use std::time::Duration; + +/// Configuration settings for throttled store +#[derive(Debug, Default, Clone, Copy)] +pub struct ThrottleConfig { + /// Sleep duration for every call to [`delete`](ThrottledStore::delete). + /// + /// Sleeping is done before the underlying store is called and independently of the success of + /// the operation. + pub wait_delete_per_call: Duration, + + /// Sleep duration for every byte received during [`get`](ThrottledStore::get). + /// + /// Sleeping is performed after the underlying store returned and only for successful gets. The + /// sleep duration is additive to [`wait_get_per_call`](Self::wait_get_per_call). + /// + /// Note that the per-byte sleep only happens as the user consumes the output bytes. Should + /// there be an intermediate failure (i.e. after partly consuming the output bytes), the + /// resulting sleep time will be partial as well. + pub wait_get_per_byte: Duration, + + /// Sleep duration for every call to [`get`](ThrottledStore::get). 
+ /// + /// Sleeping is done before the underlying store is called and independently of the success of + /// the operation. The sleep duration is additive to + /// [`wait_get_per_byte`](Self::wait_get_per_byte). + pub wait_get_per_call: Duration, + + /// Sleep duration for every call to [`list`](ThrottledStore::list). + /// + /// Sleeping is done before the underlying store is called and independently of the success of + /// the operation. The sleep duration is additive to + /// [`wait_list_per_entry`](Self::wait_list_per_entry). + pub wait_list_per_call: Duration, + + /// Sleep duration for every entry received during [`list`](ThrottledStore::list). + /// + /// Sleeping is performed after the underlying store returned and only for successful lists. + /// The sleep duration is additive to [`wait_list_per_call`](Self::wait_list_per_call). + /// + /// Note that the per-entry sleep only happens as the user consumes the output entries. Should + /// there be an intermediate failure (i.e. after partly consuming the output entries), the + /// resulting sleep time will be partial as well. + pub wait_list_per_entry: Duration, + + /// Sleep duration for every call to + /// [`list_with_delimiter`](ThrottledStore::list_with_delimiter). + /// + /// Sleeping is done before the underlying store is called and independently of the success of + /// the operation. The sleep duration is additive to + /// [`wait_list_with_delimiter_per_entry`](Self::wait_list_with_delimiter_per_entry). + pub wait_list_with_delimiter_per_call: Duration, + + /// Sleep duration for every entry received during + /// [`list_with_delimiter`](ThrottledStore::list_with_delimiter). + /// + /// Sleeping is performed after the underlying store returned and only for successful gets. The + /// sleep duration is additive to + /// [`wait_list_with_delimiter_per_call`](Self::wait_list_with_delimiter_per_call). + pub wait_list_with_delimiter_per_entry: Duration, + + /// Sleep duration for every call to [`put`](ThrottledStore::put). + /// + /// Sleeping is done before the underlying store is called and independently of the success of + /// the operation. + pub wait_put_per_call: Duration, +} + +/// Sleep only if non-zero duration +async fn sleep(duration: Duration) { + if !duration.is_zero() { + tokio::time::sleep(duration).await + } +} + +/// Store wrapper that wraps an inner store with some `sleep` calls. +/// +/// This can be used for performance testing. +/// +/// **Note that the behavior of the wrapper is deterministic and might not reflect real-world +/// conditions!** +#[derive(Debug)] +pub struct ThrottledStore { + inner: T, + config: Arc>, +} + +impl ThrottledStore { + /// Create new wrapper with zero waiting times. + pub fn new(inner: T, config: ThrottleConfig) -> Self { + Self { + inner, + config: Arc::new(Mutex::new(config)), + } + } + + /// Mutate config. + pub fn config_mut(&self, f: F) + where + F: Fn(&mut ThrottleConfig), + { + let mut guard = self.config.lock(); + f(&mut guard) + } + + /// Return copy of current config. 
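+    ///
+    /// A minimal usage sketch (assuming the in-memory store from this crate):
+    ///
+    /// ```ignore
+    /// let store = ThrottledStore::new(InMemory::new(), ThrottleConfig::default());
+    /// store.config_mut(|cfg| cfg.wait_put_per_call = std::time::Duration::from_millis(10));
+    /// assert_eq!(store.config().wait_put_per_call.as_millis(), 10);
+    /// ```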
+ pub fn config(&self) -> ThrottleConfig { + *self.config.lock() + } +} + +impl std::fmt::Display for ThrottledStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "ThrottledStore({})", self.inner) + } +} + +#[async_trait] +impl ObjectStore for ThrottledStore { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.put(location, bytes).await + } + + async fn get(&self, location: &Path) -> Result { + sleep(self.config().wait_get_per_call).await; + + // need to copy to avoid moving / referencing `self` + let wait_get_per_byte = self.config().wait_get_per_byte; + + self.inner.get(location).await.map(|result| { + let s = match result { + GetResult::Stream(s) => s, + GetResult::File(_, _) => unimplemented!(), + }; + + GetResult::Stream( + s.then(move |bytes_result| async move { + match bytes_result { + Ok(bytes) => { + let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); + sleep(wait_get_per_byte * bytes_len).await; + Ok(bytes) + } + Err(err) => Err(err), + } + }) + .boxed(), + ) + }) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let config = self.config(); + + let sleep_duration = config.wait_delete_per_call + + config.wait_get_per_byte * (range.end - range.start) as u32; + + sleep(sleep_duration).await; + + self.inner.get_range(location, range).await + } + + async fn head(&self, location: &Path) -> Result { + sleep(self.config().wait_put_per_call).await; + self.inner.head(location).await + } + + async fn delete(&self, location: &Path) -> Result<()> { + sleep(self.config().wait_delete_per_call).await; + + self.inner.delete(location).await + } + + async fn list(&self, prefix: Option<&Path>) -> Result>> { + sleep(self.config().wait_list_per_call).await; + + // need to copy to avoid moving / referencing `self` + let wait_list_per_entry = self.config().wait_list_per_entry; + + self.inner.list(prefix).await.map(|stream| { + stream + .then(move |result| async move { + match result { + Ok(entry) => { + sleep(wait_list_per_entry).await; + Ok(entry) + } + Err(err) => Err(err), + } + }) + .boxed() + }) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + sleep(self.config().wait_list_with_delimiter_per_call).await; + + match self.inner.list_with_delimiter(prefix).await { + Ok(list_result) => { + let entries_len = usize_to_u32_saturate(list_result.objects.len()); + sleep(self.config().wait_list_with_delimiter_per_entry * entries_len).await; + Ok(list_result) + } + Err(err) => Err(err), + } + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.copy(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.copy_if_not_exists(from, to).await + } +} + +/// Saturated `usize` to `u32` cast. +fn usize_to_u32_saturate(x: usize) -> u32 { + x.try_into().unwrap_or(u32::MAX) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + memory::InMemory, + tests::{ + copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, + }, + }; + use bytes::Bytes; + use futures::TryStreamExt; + use tokio::time::Duration; + use tokio::time::Instant; + + const WAIT_TIME: Duration = Duration::from_millis(100); + const ZERO: Duration = Duration::from_millis(0); // Duration::default isn't constant + + macro_rules! 
assert_bounds { + ($d:expr, $lower:expr) => { + assert_bounds!($d, $lower, $lower + 1); + }; + ($d:expr, $lower:expr, $upper:expr) => { + let d = $d; + let lower = $lower * WAIT_TIME; + let upper = $upper * WAIT_TIME; + assert!(d >= lower, "{:?} must be >= than {:?}", d, lower); + assert!(d < upper, "{:?} must be < than {:?}", d, upper); + }; + } + + #[tokio::test] + async fn throttle_test() { + let inner = InMemory::new(); + let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + put_get_delete_list(&store).await.unwrap(); + list_uses_directories_correctly(&store).await.unwrap(); + list_with_delimiter(&store).await.unwrap(); + rename_and_copy(&store).await.unwrap(); + copy_if_not_exists(&store).await.unwrap(); + } + + #[tokio::test] + async fn delete_test() { + let inner = InMemory::new(); + let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + assert_bounds!(measure_delete(&store, None).await, 0); + assert_bounds!(measure_delete(&store, Some(0)).await, 0); + assert_bounds!(measure_delete(&store, Some(10)).await, 0); + + store.config_mut(|cfg| cfg.wait_delete_per_call = WAIT_TIME); + assert_bounds!(measure_delete(&store, None).await, 1); + assert_bounds!(measure_delete(&store, Some(0)).await, 1); + assert_bounds!(measure_delete(&store, Some(10)).await, 1); + } + + #[tokio::test] + async fn get_test() { + let inner = InMemory::new(); + let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + assert_bounds!(measure_get(&store, None).await, 0); + assert_bounds!(measure_get(&store, Some(0)).await, 0); + assert_bounds!(measure_get(&store, Some(10)).await, 0); + + store.config_mut(|cfg| cfg.wait_get_per_call = WAIT_TIME); + assert_bounds!(measure_get(&store, None).await, 1); + assert_bounds!(measure_get(&store, Some(0)).await, 1); + assert_bounds!(measure_get(&store, Some(10)).await, 1); + + store.config_mut(|cfg| { + cfg.wait_get_per_call = ZERO; + cfg.wait_get_per_byte = WAIT_TIME; + }); + assert_bounds!(measure_get(&store, Some(2)).await, 2); + + store.config_mut(|cfg| { + cfg.wait_get_per_call = WAIT_TIME; + cfg.wait_get_per_byte = WAIT_TIME; + }); + assert_bounds!(measure_get(&store, Some(2)).await, 3); + } + + #[tokio::test] + async fn list_test() { + let inner = InMemory::new(); + let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + assert_bounds!(measure_list(&store, 0).await, 0); + assert_bounds!(measure_list(&store, 10).await, 0); + + store.config_mut(|cfg| cfg.wait_list_per_call = WAIT_TIME); + assert_bounds!(measure_list(&store, 0).await, 1); + assert_bounds!(measure_list(&store, 10).await, 1); + + store.config_mut(|cfg| { + cfg.wait_list_per_call = ZERO; + cfg.wait_list_per_entry = WAIT_TIME; + }); + assert_bounds!(measure_list(&store, 2).await, 2); + + store.config_mut(|cfg| { + cfg.wait_list_per_call = WAIT_TIME; + cfg.wait_list_per_entry = WAIT_TIME; + }); + assert_bounds!(measure_list(&store, 2).await, 3); + } + + #[tokio::test] + async fn list_with_delimiter_test() { + let inner = InMemory::new(); + let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + assert_bounds!(measure_list_with_delimiter(&store, 0).await, 0); + assert_bounds!(measure_list_with_delimiter(&store, 10).await, 0); + + store.config_mut(|cfg| cfg.wait_list_with_delimiter_per_call = WAIT_TIME); + assert_bounds!(measure_list_with_delimiter(&store, 0).await, 1); + assert_bounds!(measure_list_with_delimiter(&store, 10).await, 1); + + store.config_mut(|cfg| { + cfg.wait_list_with_delimiter_per_call = ZERO; + 
cfg.wait_list_with_delimiter_per_entry = WAIT_TIME; + }); + assert_bounds!(measure_list_with_delimiter(&store, 2).await, 2); + + store.config_mut(|cfg| { + cfg.wait_list_with_delimiter_per_call = WAIT_TIME; + cfg.wait_list_with_delimiter_per_entry = WAIT_TIME; + }); + assert_bounds!(measure_list_with_delimiter(&store, 2).await, 3); + } + + #[tokio::test] + async fn put_test() { + let inner = InMemory::new(); + let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + assert_bounds!(measure_put(&store, 0).await, 0); + assert_bounds!(measure_put(&store, 10).await, 0); + + store.config_mut(|cfg| cfg.wait_put_per_call = WAIT_TIME); + assert_bounds!(measure_put(&store, 0).await, 1); + assert_bounds!(measure_put(&store, 10).await, 1); + + store.config_mut(|cfg| cfg.wait_put_per_call = ZERO); + assert_bounds!(measure_put(&store, 0).await, 0); + } + + async fn place_test_object(store: &ThrottledStore, n_bytes: Option) -> Path { + let path = Path::from("foo"); + + if let Some(n_bytes) = n_bytes { + let data: Vec<_> = std::iter::repeat(1u8).take(n_bytes).collect(); + let bytes = Bytes::from(data); + store.put(&path, bytes).await.unwrap(); + } else { + // ensure object is absent + store.delete(&path).await.unwrap(); + } + + path + } + + async fn place_test_objects(store: &ThrottledStore, n_entries: usize) -> Path { + let prefix = Path::from("foo"); + + // clean up store + let entries: Vec<_> = store + .list(Some(&prefix)) + .await + .unwrap() + .try_collect() + .await + .unwrap(); + + for entry in entries { + store.delete(&entry.location).await.unwrap(); + } + + // create new entries + for i in 0..n_entries { + let path = prefix.child(i.to_string().as_str()); + + let data = Bytes::from("bar"); + store.put(&path, data).await.unwrap(); + } + + prefix + } + + async fn measure_delete(store: &ThrottledStore, n_bytes: Option) -> Duration { + let path = place_test_object(store, n_bytes).await; + + let t0 = Instant::now(); + store.delete(&path).await.unwrap(); + + t0.elapsed() + } + + async fn measure_get(store: &ThrottledStore, n_bytes: Option) -> Duration { + let path = place_test_object(store, n_bytes).await; + + let t0 = Instant::now(); + let res = store.get(&path).await; + if n_bytes.is_some() { + // need to consume bytes to provoke sleep times + let s = match res.unwrap() { + GetResult::Stream(s) => s, + GetResult::File(_, _) => unimplemented!(), + }; + + s.map_ok(|b| bytes::BytesMut::from(&b[..])) + .try_concat() + .await + .unwrap(); + } else { + assert!(res.is_err()); + } + + t0.elapsed() + } + + async fn measure_list(store: &ThrottledStore, n_entries: usize) -> Duration { + let prefix = place_test_objects(store, n_entries).await; + + let t0 = Instant::now(); + store + .list(Some(&prefix)) + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + + t0.elapsed() + } + + async fn measure_list_with_delimiter( + store: &ThrottledStore, + n_entries: usize, + ) -> Duration { + let prefix = place_test_objects(store, n_entries).await; + + let t0 = Instant::now(); + store.list_with_delimiter(Some(&prefix)).await.unwrap(); + + t0.elapsed() + } + + async fn measure_put(store: &ThrottledStore, n_bytes: usize) -> Duration { + let data: Vec<_> = std::iter::repeat(1u8).take(n_bytes).collect(); + let bytes = Bytes::from(data); + + let t0 = Instant::now(); + store.put(&Path::from("foo"), bytes).await.unwrap(); + + t0.elapsed() + } +} diff --git a/object_store_rs/src/token.rs b/object_store_rs/src/token.rs new file mode 100644 index 000000000000..4e7aa84c54bf --- /dev/null +++ 
b/object_store_rs/src/token.rs @@ -0,0 +1,47 @@ +use std::future::Future; +use std::time::Instant; +use tokio::sync::Mutex; + +/// A temporary authentication token with an associated expiry +#[derive(Debug, Clone)] +pub struct TemporaryToken { + /// The temporary credential + pub token: T, + /// The instant at which this credential is no longer valid + pub expiry: Instant, +} + +/// Provides [`TokenCache::get_or_insert_with`] which can be used to cache a +/// [`TemporaryToken`] based on its expiry +#[derive(Debug, Default)] +pub struct TokenCache { + cache: Mutex>>, +} + +impl TokenCache { + pub async fn get_or_insert_with(&self, f: F) -> Result + where + F: FnOnce() -> Fut + Send, + Fut: Future, E>> + Send, + { + let now = Instant::now(); + let mut locked = self.cache.lock().await; + + if let Some(cached) = locked.as_ref() { + let delta = cached + .expiry + .checked_duration_since(now) + .unwrap_or_default(); + + if delta.as_secs() > 300 { + return Ok(cached.token.clone()); + } + } + + let cached = f().await?; + let token = cached.token.clone(); + *locked = Some(cached); + + Ok(token) + } +} diff --git a/object_store_rs/src/util.rs b/object_store_rs/src/util.rs new file mode 100644 index 000000000000..7671da3d84c7 --- /dev/null +++ b/object_store_rs/src/util.rs @@ -0,0 +1,56 @@ +//! Common logic for interacting with remote object stores +use super::Result; +use bytes::Bytes; +use futures::{stream::StreamExt, Stream}; + +/// Returns the prefix to be passed to an object store +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub fn format_prefix(prefix: Option<&crate::path::Path>) -> Option { + prefix + .filter(|x| !x.as_ref().is_empty()) + .map(|p| format!("{}{}", p.as_ref(), crate::path::DELIMITER)) +} + +/// Returns a formatted HTTP range header as per +/// +#[cfg(any(feature = "aws", feature = "gcp"))] +pub fn format_http_range(range: std::ops::Range) -> String { + format!("bytes={}-{}", range.start, range.end.saturating_sub(1)) +} + +/// Collect a stream into [`Bytes`] avoiding copying in the event of a single chunk +pub async fn collect_bytes(mut stream: S, size_hint: Option) -> Result +where + S: Stream> + Send + Unpin, +{ + let first = stream.next().await.transpose()?.unwrap_or_default(); + + // Avoid copying if single response + match stream.next().await.transpose()? 
{ + None => Ok(first), + Some(second) => { + let size_hint = size_hint.unwrap_or_else(|| first.len() + second.len()); + + let mut buf = Vec::with_capacity(size_hint); + buf.extend_from_slice(&first); + buf.extend_from_slice(&second); + while let Some(maybe_bytes) = stream.next().await { + buf.extend_from_slice(&maybe_bytes?); + } + + Ok(buf.into()) + } + } +} + +/// Takes a function and spawns it to a tokio blocking pool if available +pub async fn maybe_spawn_blocking(f: F) -> Result +where + F: FnOnce() -> Result + Send + 'static, + T: Send + 'static, +{ + match tokio::runtime::Handle::try_current() { + Ok(runtime) => runtime.spawn_blocking(f).await?, + Err(_) => f(), + } +} From b18cfd7c6b8109bf4c143df190b88a34b6981b4d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Jul 2022 12:56:34 -0400 Subject: [PATCH 02/12] Add object_store to workspace, update notes and readme --- Cargo.toml | 1 + {object_store_rs => object_store}/.circleci/config.yml | 0 {object_store_rs => object_store}/.github/dependabot.yml | 0 .../.github/workflows/semantic.yml | 0 {object_store_rs => object_store}/.gitignore | 0 {object_store_rs => object_store}/.kodiak.toml | 0 {object_store_rs => object_store}/CONTRIBUTING.md | 0 {object_store_rs => object_store}/Cargo.toml | 4 ++-- {object_store_rs => object_store}/LICENSE-APACHE | 0 {object_store_rs => object_store}/LICENSE-MIT | 0 {object_store_rs => object_store}/README.md | 0 {object_store_rs => object_store}/SECURITY.md | 0 {object_store_rs => object_store}/deny.toml | 0 {object_store_rs => object_store}/rust-toolchain.toml | 0 {object_store_rs => object_store}/rustfmt.toml | 0 {object_store_rs => object_store}/src/aws.rs | 0 {object_store_rs => object_store}/src/azure.rs | 0 {object_store_rs => object_store}/src/gcp.rs | 0 {object_store_rs => object_store}/src/lib.rs | 0 {object_store_rs => object_store}/src/local.rs | 0 {object_store_rs => object_store}/src/memory.rs | 0 {object_store_rs => object_store}/src/oauth.rs | 0 {object_store_rs => object_store}/src/path/mod.rs | 0 {object_store_rs => object_store}/src/path/parts.rs | 0 {object_store_rs => object_store}/src/throttle.rs | 0 {object_store_rs => object_store}/src/token.rs | 0 {object_store_rs => object_store}/src/util.rs | 0 27 files changed, 3 insertions(+), 2 deletions(-) rename {object_store_rs => object_store}/.circleci/config.yml (100%) rename {object_store_rs => object_store}/.github/dependabot.yml (100%) rename {object_store_rs => object_store}/.github/workflows/semantic.yml (100%) rename {object_store_rs => object_store}/.gitignore (100%) rename {object_store_rs => object_store}/.kodiak.toml (100%) rename {object_store_rs => object_store}/CONTRIBUTING.md (100%) rename {object_store_rs => object_store}/Cargo.toml (93%) rename {object_store_rs => object_store}/LICENSE-APACHE (100%) rename {object_store_rs => object_store}/LICENSE-MIT (100%) rename {object_store_rs => object_store}/README.md (100%) rename {object_store_rs => object_store}/SECURITY.md (100%) rename {object_store_rs => object_store}/deny.toml (100%) rename {object_store_rs => object_store}/rust-toolchain.toml (100%) rename {object_store_rs => object_store}/rustfmt.toml (100%) rename {object_store_rs => object_store}/src/aws.rs (100%) rename {object_store_rs => object_store}/src/azure.rs (100%) rename {object_store_rs => object_store}/src/gcp.rs (100%) rename {object_store_rs => object_store}/src/lib.rs (100%) rename {object_store_rs => object_store}/src/local.rs (100%) rename {object_store_rs => object_store}/src/memory.rs (100%) 
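
As a brief aside on the `TokenCache` introduced in `src/token.rs` above: a minimal usage sketch, assuming the cache is generic over the token type and that the sketch lives inside the crate. The `fetch_token` helper, its `std::io::Error` error type, and the token value are illustrative assumptions only, not part of this patch.

use std::time::{Duration, Instant};

// Assumes the sketch is inside the crate; token.rs is not a public module.
use crate::token::{TemporaryToken, TokenCache};

// Hypothetical token fetcher; any fallible async closure that returns a
// TemporaryToken works with TokenCache::get_or_insert_with.
async fn fetch_token() -> Result<TemporaryToken<String>, std::io::Error> {
    Ok(TemporaryToken {
        token: "example-bearer-token".to_string(),
        // Per the cache logic above, tokens more than ~300 seconds from
        // expiry are served from the cache on later calls.
        expiry: Instant::now() + Duration::from_secs(3600),
    })
}

async fn authorized_token(cache: &TokenCache<String>) -> Result<String, std::io::Error> {
    // The first call runs fetch_token; subsequent calls reuse the cached
    // token until it is within 300 seconds of its expiry.
    cache.get_or_insert_with(fetch_token).await
}
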
rename {object_store_rs => object_store}/src/oauth.rs (100%) rename {object_store_rs => object_store}/src/path/mod.rs (100%) rename {object_store_rs => object_store}/src/path/parts.rs (100%) rename {object_store_rs => object_store}/src/throttle.rs (100%) rename {object_store_rs => object_store}/src/token.rs (100%) rename {object_store_rs => object_store}/src/util.rs (100%) diff --git a/Cargo.toml b/Cargo.toml index 2837f028e8c4..9bf55c0f2360 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "parquet_derive_test", "arrow-flight", "integration-testing", + "object_store", ] # Enable the version 2 feature resolver, which avoids unifying features for targets that are not being built # diff --git a/object_store_rs/.circleci/config.yml b/object_store/.circleci/config.yml similarity index 100% rename from object_store_rs/.circleci/config.yml rename to object_store/.circleci/config.yml diff --git a/object_store_rs/.github/dependabot.yml b/object_store/.github/dependabot.yml similarity index 100% rename from object_store_rs/.github/dependabot.yml rename to object_store/.github/dependabot.yml diff --git a/object_store_rs/.github/workflows/semantic.yml b/object_store/.github/workflows/semantic.yml similarity index 100% rename from object_store_rs/.github/workflows/semantic.yml rename to object_store/.github/workflows/semantic.yml diff --git a/object_store_rs/.gitignore b/object_store/.gitignore similarity index 100% rename from object_store_rs/.gitignore rename to object_store/.gitignore diff --git a/object_store_rs/.kodiak.toml b/object_store/.kodiak.toml similarity index 100% rename from object_store_rs/.kodiak.toml rename to object_store/.kodiak.toml diff --git a/object_store_rs/CONTRIBUTING.md b/object_store/CONTRIBUTING.md similarity index 100% rename from object_store_rs/CONTRIBUTING.md rename to object_store/CONTRIBUTING.md diff --git a/object_store_rs/Cargo.toml b/object_store/Cargo.toml similarity index 93% rename from object_store_rs/Cargo.toml rename to object_store/Cargo.toml index 762a3de96e57..b28bfd189fab 100644 --- a/object_store_rs/Cargo.toml +++ b/object_store/Cargo.toml @@ -4,13 +4,13 @@ version = "0.3.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" -description = "A generic object store interface" +description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage and Azure Blob Storage" keywords = [ "object", "storage", "cloud", ] -repository = "https://github.com/influxdata/object_store_rs" +repository = "https://github.com/apache/arrow-rs" [package.metadata.docs.rs] all-features = true diff --git a/object_store_rs/LICENSE-APACHE b/object_store/LICENSE-APACHE similarity index 100% rename from object_store_rs/LICENSE-APACHE rename to object_store/LICENSE-APACHE diff --git a/object_store_rs/LICENSE-MIT b/object_store/LICENSE-MIT similarity index 100% rename from object_store_rs/LICENSE-MIT rename to object_store/LICENSE-MIT diff --git a/object_store_rs/README.md b/object_store/README.md similarity index 100% rename from object_store_rs/README.md rename to object_store/README.md diff --git a/object_store_rs/SECURITY.md b/object_store/SECURITY.md similarity index 100% rename from object_store_rs/SECURITY.md rename to object_store/SECURITY.md diff --git a/object_store_rs/deny.toml b/object_store/deny.toml similarity index 100% rename from object_store_rs/deny.toml rename to object_store/deny.toml diff --git a/object_store_rs/rust-toolchain.toml b/object_store/rust-toolchain.toml similarity index 100% 
rename from object_store_rs/rust-toolchain.toml rename to object_store/rust-toolchain.toml diff --git a/object_store_rs/rustfmt.toml b/object_store/rustfmt.toml similarity index 100% rename from object_store_rs/rustfmt.toml rename to object_store/rustfmt.toml diff --git a/object_store_rs/src/aws.rs b/object_store/src/aws.rs similarity index 100% rename from object_store_rs/src/aws.rs rename to object_store/src/aws.rs diff --git a/object_store_rs/src/azure.rs b/object_store/src/azure.rs similarity index 100% rename from object_store_rs/src/azure.rs rename to object_store/src/azure.rs diff --git a/object_store_rs/src/gcp.rs b/object_store/src/gcp.rs similarity index 100% rename from object_store_rs/src/gcp.rs rename to object_store/src/gcp.rs diff --git a/object_store_rs/src/lib.rs b/object_store/src/lib.rs similarity index 100% rename from object_store_rs/src/lib.rs rename to object_store/src/lib.rs diff --git a/object_store_rs/src/local.rs b/object_store/src/local.rs similarity index 100% rename from object_store_rs/src/local.rs rename to object_store/src/local.rs diff --git a/object_store_rs/src/memory.rs b/object_store/src/memory.rs similarity index 100% rename from object_store_rs/src/memory.rs rename to object_store/src/memory.rs diff --git a/object_store_rs/src/oauth.rs b/object_store/src/oauth.rs similarity index 100% rename from object_store_rs/src/oauth.rs rename to object_store/src/oauth.rs diff --git a/object_store_rs/src/path/mod.rs b/object_store/src/path/mod.rs similarity index 100% rename from object_store_rs/src/path/mod.rs rename to object_store/src/path/mod.rs diff --git a/object_store_rs/src/path/parts.rs b/object_store/src/path/parts.rs similarity index 100% rename from object_store_rs/src/path/parts.rs rename to object_store/src/path/parts.rs diff --git a/object_store_rs/src/throttle.rs b/object_store/src/throttle.rs similarity index 100% rename from object_store_rs/src/throttle.rs rename to object_store/src/throttle.rs diff --git a/object_store_rs/src/token.rs b/object_store/src/token.rs similarity index 100% rename from object_store_rs/src/token.rs rename to object_store/src/token.rs diff --git a/object_store_rs/src/util.rs b/object_store/src/util.rs similarity index 100% rename from object_store_rs/src/util.rs rename to object_store/src/util.rs From c11c5fc7df76d95e8a2ecf9a4f633e0af5357967 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Jul 2022 13:22:50 -0400 Subject: [PATCH 03/12] Remove old github items --- object_store/.github/dependabot.yml | 15 --------------- object_store/.github/workflows/semantic.yml | 10 ---------- 2 files changed, 25 deletions(-) delete mode 100644 object_store/.github/dependabot.yml delete mode 100644 object_store/.github/workflows/semantic.yml diff --git a/object_store/.github/dependabot.yml b/object_store/.github/dependabot.yml deleted file mode 100644 index 656e622d3fce..000000000000 --- a/object_store/.github/dependabot.yml +++ /dev/null @@ -1,15 +0,0 @@ ---- -version: 2 -updates: - - package-ecosystem: "cargo" - directory: "/" - schedule: - interval: daily - open-pull-requests-limit: 10 - ignore: - # Thrift version needs to match the version of the thrift-compiler used to generate code, - # and therefore this dependency requires a more manual upgrade - # - # Additionally the thrift-compiler version available in standard repos tends to lag - # the latest release significantly, and so updating to the latest version adds friction - - dependency-name: "thrift" diff --git a/object_store/.github/workflows/semantic.yml 
b/object_store/.github/workflows/semantic.yml deleted file mode 100644 index 5fbae6dc995b..000000000000 --- a/object_store/.github/workflows/semantic.yml +++ /dev/null @@ -1,10 +0,0 @@ ---- -name: "Semantic PR and Commit Messages" - -on: - pull_request: - types: [opened, reopened, synchronize, edited] - -jobs: - semantic: - uses: influxdata/validate-semantic-github-messages/.github/workflows/semantic.yml@main From 5dbe7cdeeae197bd65aa86b8846a9454cc89b26d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Jul 2022 13:23:09 -0400 Subject: [PATCH 04/12] Remove old gitignore --- object_store/.gitignore | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 object_store/.gitignore diff --git a/object_store/.gitignore b/object_store/.gitignore deleted file mode 100644 index 6f38ebf95f1a..000000000000 --- a/object_store/.gitignore +++ /dev/null @@ -1,11 +0,0 @@ -**/target -**/*.rs.bk -.idea/ -.env -.gdb_history -*.tsm -**/.DS_Store -**/.vscode -massif.out.* -valgrind-out.txt -Cargo.lock From 1d42d787b490cccceb13466a7bc532b05a3f2ff6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Jul 2022 13:23:36 -0400 Subject: [PATCH 05/12] Remove kodiak config --- object_store/.kodiak.toml | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 object_store/.kodiak.toml diff --git a/object_store/.kodiak.toml b/object_store/.kodiak.toml deleted file mode 100644 index f96092d04828..000000000000 --- a/object_store/.kodiak.toml +++ /dev/null @@ -1,2 +0,0 @@ -version = 1 -merge.method = "squash" From 17a336db505cb403abdae4ea0f7bc11d62e01e4e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Jul 2022 13:24:16 -0400 Subject: [PATCH 06/12] Remove redundant license files --- object_store/LICENSE-APACHE | 201 ------------------------------------ object_store/LICENSE-MIT | 25 ----- 2 files changed, 226 deletions(-) delete mode 100644 object_store/LICENSE-APACHE delete mode 100644 object_store/LICENSE-MIT diff --git a/object_store/LICENSE-APACHE b/object_store/LICENSE-APACHE deleted file mode 100644 index 16fe87b06e80..000000000000 --- a/object_store/LICENSE-APACHE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - -2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - -3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - -4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - -5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - -6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - -8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - -END OF TERMS AND CONDITIONS - -APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - -Copyright [yyyy] [name of copyright owner] - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. diff --git a/object_store/LICENSE-MIT b/object_store/LICENSE-MIT deleted file mode 100644 index 42d41dfd83f6..000000000000 --- a/object_store/LICENSE-MIT +++ /dev/null @@ -1,25 +0,0 @@ -Copyright (c) 2020 InfluxData - -Permission is hereby granted, free of charge, to any -person obtaining a copy of this software and associated -documentation files (the "Software"), to deal in the -Software without restriction, including without -limitation the rights to use, copy, modify, merge, -publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software -is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice -shall be included in all copies or substantial portions -of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF -ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT -SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. From 674d07f53057df0f3f661dabf1349ad747e4360b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Jul 2022 13:24:50 -0400 Subject: [PATCH 07/12] Remove influx specific security policy --- object_store/SECURITY.md | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 object_store/SECURITY.md diff --git a/object_store/SECURITY.md b/object_store/SECURITY.md deleted file mode 100644 index 2b708f96e6dd..000000000000 --- a/object_store/SECURITY.md +++ /dev/null @@ -1,5 +0,0 @@ -# Security Policy - -## Reporting a Vulnerability - -InfluxData takes security and our users' trust very seriously. If you believe you have found a security issue in any of our open source projects, please responsibly disclose it by contacting security@influxdata.com. More details about security vulnerability reporting, including our GPG key, can be found at https://www.influxdata.com/how-to-report-security-vulnerabilities/. From fe95da239aed2d0e014a85aa4aefe4ddc763eb4c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Jul 2022 13:26:15 -0400 Subject: [PATCH 08/12] Remove redudant rust-toolchain and rustfmt --- object_store/rust-toolchain.toml | 3 --- object_store/rustfmt.toml | 6 ------ 2 files changed, 9 deletions(-) delete mode 100644 object_store/rust-toolchain.toml delete mode 100644 object_store/rustfmt.toml diff --git a/object_store/rust-toolchain.toml b/object_store/rust-toolchain.toml deleted file mode 100644 index 1e0347218dc8..000000000000 --- a/object_store/rust-toolchain.toml +++ /dev/null @@ -1,3 +0,0 @@ -[toolchain] -channel = "1.60" -components = [ "rustfmt", "clippy" ] diff --git a/object_store/rustfmt.toml b/object_store/rustfmt.toml deleted file mode 100644 index 7b060483beca..000000000000 --- a/object_store/rustfmt.toml +++ /dev/null @@ -1,6 +0,0 @@ -edition = "2021" - -# Unstable features not yet supported on stable Rust -#wrap_comments = true -#imports_granularity = "Crate" -#group_imports = "StdExternalCrate" From fb5ce62b06e1f8ad813b9e9e121094fe19bfc621 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Jul 2022 13:33:31 -0400 Subject: [PATCH 09/12] Add Apache License (RAT) --- object_store/CONTRIBUTING.md | 42 ++++++------- object_store/Cargo.toml | 17 ++++++ object_store/README.md | 23 ++++++- object_store/deny.toml | 17 ++++++ object_store/src/aws.rs | 107 +++++++++++++++++++++++---------- object_store/src/azure.rs | 43 +++++++++++-- object_store/src/gcp.rs | 57 +++++++++++++++--- object_store/src/lib.rs | 90 +++++++++++++++++---------- object_store/src/local.rs | 97 ++++++++++++++++++------------ object_store/src/memory.rs | 22 ++++++- object_store/src/oauth.rs | 17 ++++++ object_store/src/path/mod.rs | 55 ++++++++++++----- object_store/src/path/parts.rs | 17 ++++++ object_store/src/throttle.rs | 50 ++++++++++++--- object_store/src/token.rs | 17 ++++++ object_store/src/util.rs | 17 ++++++ 16 files changed, 530 insertions(+), 158 deletions(-) diff --git a/object_store/CONTRIBUTING.md b/object_store/CONTRIBUTING.md index 188b7ee7de2e..2e216dd48662 100644 --- a/object_store/CONTRIBUTING.md +++ b/object_store/CONTRIBUTING.md @@ -1,25 +1,23 @@ -# Contributing - -Thank you for thinking of contributing! We very much welcome contributions from the community. 
To make the process -easier and more valuable for everyone involved we have a few rules and guidelines to follow. - -Anyone with a Github account is free to file issues on the project. However, if you want to contribute documentation or -code then you will need to sign InfluxData's Individual Contributor License Agreement (CLA), which can be found with -more information [on our website](https://www.influxdata.com/legal/cla/). - -## Submitting Issues and Feature Requests - -Before you file an [issue](https://github.com/influxdata/object_store_rs/issues/new), please search existing issues in -case -the same or similar issues have already been filed. If you find an existing open ticket covering your issue then please -avoid adding "👍" or "me too" comments; Github notifications can cause a lot of noise for the project maintainers who -triage the back-log. - -However, if you have a new piece of information for an existing ticket and you think it may help the investigation or -resolution, then please do add it as a comment! - -You can signal to the team that you're experiencing an existing issue with one of Github's emoji reactions (these are a -good way to add "weight" to an issue from a prioritization perspective). + + +# Development instructions ## Running Tests diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index b28bfd189fab..613b6ab2ecf3 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + [package] name = "object_store" version = "0.3.0" diff --git a/object_store/README.md b/object_store/README.md index d5745a96d271..313588b4a73b 100644 --- a/object_store/README.md +++ b/object_store/README.md @@ -1,5 +1,26 @@ + + # Rust Object Store -A crate providing a generic interface to object stores, such as S3, Azure Blob Storage and Google Cloud Storage. Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out for external consumption. +A crate providing a generic interface to object stores, such as S3, Azure Blob Storage and Google Cloud Storage. + +Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to Apache Arrow. See [docs.rs](https://docs.rs/object_store) for usage instructions diff --git a/object_store/deny.toml b/object_store/deny.toml index 02324f790019..bfd060a0b94d 100644 --- a/object_store/deny.toml +++ b/object_store/deny.toml @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + # Configuration documentation: #  https://embarkstudios.github.io/cargo-deny/index.html diff --git a/object_store/src/aws.rs b/object_store/src/aws.rs index 57bfeaf6372d..7ebcc2a88412 100644 --- a/object_store/src/aws.rs +++ b/object_store/src/aws.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + //! An object store implementation for S3 use crate::util::format_http_range; use crate::{ @@ -20,7 +37,9 @@ use rusoto_s3::S3; use rusoto_sts::WebIdentityProvider; use snafu::{OptionExt, ResultExt, Snafu}; use std::ops::Range; -use std::{convert::TryFrom, fmt, num::NonZeroUsize, ops::Deref, sync::Arc, time::Duration}; +use std::{ + convert::TryFrom, fmt, num::NonZeroUsize, ops::Deref, sync::Arc, time::Duration, +}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tracing::{debug, warn}; @@ -31,10 +50,18 @@ pub const MAX_NUM_RETRIES: u32 = 3; #[derive(Debug, Snafu)] #[allow(missing_docs)] enum Error { - #[snafu(display("Expected streamed data to have length {}, got {}", expected, actual))] + #[snafu(display( + "Expected streamed data to have length {}, got {}", + expected, + actual + ))] DataDoesNotMatchLength { expected: usize, actual: usize }, - #[snafu(display("Did not receive any data. Bucket: {}, Location: {}", bucket, path))] + #[snafu(display( + "Did not receive any data. Bucket: {}, Location: {}", + bucket, + path + ))] NoData { bucket: String, path: String }, #[snafu(display( @@ -270,12 +297,12 @@ impl ObjectStore for AmazonS3 { .head_object(head_request) .await .map_err(|e| match e { - rusoto_core::RusotoError::Service(rusoto_s3::HeadObjectError::NoSuchKey(_)) => { - Error::NotFound { - path: key.clone(), - source: e.into(), - } - } + rusoto_core::RusotoError::Service( + rusoto_s3::HeadObjectError::NoSuchKey(_), + ) => Error::NotFound { + path: key.clone(), + source: e.into(), + }, rusoto_core::RusotoError::Unknown(h) if h.status.as_u16() == 404 => { Error::NotFound { path: key.clone(), @@ -337,7 +364,10 @@ impl ObjectStore for AmazonS3 { Ok(()) } - async fn list(&self, prefix: Option<&Path>) -> Result>> { + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { Ok(self .list_objects_v2(prefix, None) .await? 
@@ -372,11 +402,13 @@ impl ObjectStore for AmazonS3 { res.objects.append(&mut objects); - let prefixes = list_objects_v2_result.common_prefixes.unwrap_or_default(); + let prefixes = + list_objects_v2_result.common_prefixes.unwrap_or_default(); res.common_prefixes.reserve(prefixes.len()); for p in prefixes { - let prefix = p.prefix.expect("can't have a prefix without a value"); + let prefix = + p.prefix.expect("can't have a prefix without a value"); res.common_prefixes.push(Path::parse(prefix)?); } @@ -430,8 +462,8 @@ fn convert_object_meta(object: rusoto_s3::Object, bucket: &str) -> Result Utc::now(), }; - let size = - usize::try_from(object.size.unwrap_or(0)).expect("unsupported size on this platform"); + let size = usize::try_from(object.size.unwrap_or(0)) + .expect("unsupported size on this platform"); Ok(ObjectMeta { location, @@ -494,16 +526,26 @@ pub fn new_s3( rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) } (Some(access_key_id), Some(secret_access_key), None) => { - let credentials_provider = - StaticProvider::new_minimal(access_key_id.into(), secret_access_key.into()); + let credentials_provider = StaticProvider::new_minimal( + access_key_id.into(), + secret_access_key.into(), + ); rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) } (None, Some(_), _) => return Err(Error::MissingAccessKey.into()), (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), _ if std::env::var_os("AWS_WEB_IDENTITY_TOKEN_FILE").is_some() => { - rusoto_s3::S3Client::new_with(http_client, WebIdentityProvider::from_k8s_env(), region) + rusoto_s3::S3Client::new_with( + http_client, + WebIdentityProvider::from_k8s_env(), + region, + ) } - _ => rusoto_s3::S3Client::new_with(http_client, InstanceMetadataProvider::new(), region), + _ => rusoto_s3::S3Client::new_with( + http_client, + InstanceMetadataProvider::new(), + region, + ), }; Ok(AmazonS3 { @@ -579,12 +621,12 @@ impl AmazonS3 { .get_object(get_request) .await .map_err(|e| match e { - rusoto_core::RusotoError::Service(rusoto_s3::GetObjectError::NoSuchKey(_)) => { - Error::NotFound { - path: key.clone(), - source: e.into(), - } - } + rusoto_core::RusotoError::Service( + rusoto_s3::GetObjectError::NoSuchKey(_), + ) => Error::NotFound { + path: key.clone(), + source: e.into(), + }, _ => Error::UnableToGetData { bucket: self.bucket_name.to_owned(), path: key.clone(), @@ -669,12 +711,13 @@ impl AmazonS3 { // `next_continuation_token`, and we're assuming that `next_continuation_token` // is only set when `is_truncated` is true (and therefore not // checking `is_truncated`). - let next_state = - if let Some(next_continuation_token) = &resp.next_continuation_token { - ListState::HasMore(next_continuation_token.to_string()) - } else { - ListState::Done - }; + let next_state = if let Some(next_continuation_token) = + &resp.next_continuation_token + { + ListState::HasMore(next_continuation_token.to_string()) + } else { + ListState::Done + }; Some((Ok(resp), next_state)) } @@ -705,7 +748,9 @@ impl AmazonS3 { /// this function will return the last encountered error. /// /// Client errors (4xx) will never be retried by this function. 
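
The doc comment above states the retry policy for `s3_request`: retry up to `MAX_NUM_RETRIES`, return the last encountered error, and never retry client (4xx) errors. A minimal generic sketch of that policy follows; it is an illustration only, not the crate's implementation, and the `is_client_error` predicate plus the locally redefined constant (mirroring the crate's value of 3) are assumptions for the example.

// Illustrative only: retries an operation a bounded number of times,
// returning the last error, and never retrying client errors.
const MAX_NUM_RETRIES: u32 = 3;

async fn retry_with_limit<T, E, F, Fut>(
    mut op: F,
    is_client_error: impl Fn(&E) -> bool,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let mut last_err = None;
    for _ in 0..MAX_NUM_RETRIES {
        match op().await {
            Ok(v) => return Ok(v),
            // Client errors (4xx) are never retried.
            Err(e) if is_client_error(&e) => return Err(e),
            // Otherwise remember the error and try again.
            Err(e) => last_err = Some(e),
        }
    }
    Err(last_err.expect("MAX_NUM_RETRIES is non-zero"))
}
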
-async fn s3_request(future_factory: F) -> Result> +async fn s3_request( + future_factory: F, +) -> Result> where E: std::error::Error + Send, F: Fn() -> G + Send, diff --git a/object_store/src/azure.rs b/object_store/src/azure.rs index 74b947d3d3b4..5f4327982cd0 100644 --- a/object_store/src/azure.rs +++ b/object_store/src/azure.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + //! An object store implementation for Azure blob storage use crate::{ path::{Path, DELIMITER}, @@ -116,7 +133,11 @@ enum Error { to: String, }, - #[snafu(display("Unable parse source url. Container: {}, Error: {}", container, source))] + #[snafu(display( + "Unable parse source url. Container: {}, Error: {}", + container, + source + ))] UnableToParseUrl { source: url::ParseError, container: String, @@ -224,7 +245,11 @@ impl ObjectStore for MicrosoftAzure { )) } - async fn get_range(&self, location: &Path, range: std::ops::Range) -> Result { + async fn get_range( + &self, + location: &Path, + range: std::ops::Range, + ) -> Result { let blob = self .container_client .as_blob_client(location.as_ref()) @@ -291,7 +316,10 @@ impl ObjectStore for MicrosoftAzure { Ok(()) } - async fn list(&self, prefix: Option<&Path>) -> Result>> { + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { let stream = self .list_impl(prefix, false) .await? @@ -500,7 +528,11 @@ pub fn new_azure( } else { ( false, - StorageAccountClient::new_access_key(Arc::clone(&http_client), &account, &access_key), + StorageAccountClient::new_access_key( + Arc::clone(&http_client), + &account, + &access_key, + ), ) }; @@ -583,7 +615,8 @@ mod tests { return; } else { AzureConfig { - storage_account: env::var("AZURE_STORAGE_ACCOUNT").unwrap_or_default(), + storage_account: env::var("AZURE_STORAGE_ACCOUNT") + .unwrap_or_default(), access_key: env::var("AZURE_STORAGE_ACCESS_KEY").unwrap_or_default(), bucket: env::var("OBJECT_STORE_BUCKET") .expect("already checked OBJECT_STORE_BUCKET"), diff --git a/object_store/src/gcp.rs b/object_store/src/gcp.rs index b36838043273..84fb572bd77b 100644 --- a/object_store/src/gcp.rs +++ b/object_store/src/gcp.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + //! An object store implementation for Google Cloud Storage use std::collections::BTreeSet; use std::fs::File; @@ -157,7 +174,8 @@ impl GoogleCloudStorage { } fn object_url(&self, path: &Path) -> String { - let encoded = percent_encoding::utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); + let encoded = + percent_encoding::utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); format!( "{}/storage/v1/b/{}/o/{}", self.base_url, self.bucket_name_encoded, encoded @@ -247,14 +265,25 @@ impl GoogleCloudStorage { } /// Perform a copy request - async fn copy_request(&self, from: &Path, to: &Path, if_not_exists: bool) -> Result<()> { + async fn copy_request( + &self, + from: &Path, + to: &Path, + if_not_exists: bool, + ) -> Result<()> { let token = self.get_token().await?; - let source = percent_encoding::utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); - let destination = percent_encoding::utf8_percent_encode(to.as_ref(), NON_ALPHANUMERIC); + let source = + percent_encoding::utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); + let destination = + percent_encoding::utf8_percent_encode(to.as_ref(), NON_ALPHANUMERIC); let url = format!( "{}/storage/v1/b/{}/o/{}/copyTo/b/{}/o/{}", - self.base_url, self.bucket_name_encoded, source, self.bucket_name_encoded, destination + self.base_url, + self.bucket_name_encoded, + source, + self.bucket_name_encoded, + destination ); let mut builder = self.client.request(Method::POST, url); @@ -410,10 +439,17 @@ impl ObjectStore for GoogleCloudStorage { self.delete_request(location).await } - async fn list(&self, prefix: Option<&Path>) -> Result>> { + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { let stream = self .list_paginated(prefix, false)? - .map_ok(|r| futures::stream::iter(r.items.into_iter().map(|x| convert_object_meta(&x)))) + .map_ok(|r| { + futures::stream::iter( + r.items.into_iter().map(|x| convert_object_meta(&x)), + ) + }) .try_flatten() .boxed(); @@ -486,7 +522,8 @@ pub fn new_gcs( .transpose()?; let bucket_name = bucket_name.into(); - let encoded_bucket_name = percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); + let encoded_bucket_name = + percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); // The cloud storage crate currently only supports authentication via // environment variables. Set the environment variable explicitly so @@ -674,7 +711,9 @@ mod test { .unwrap_err() .to_string(); assert!( - err.contains("Error performing put request: HTTP status client error (404 Not Found)"), + err.contains( + "Error performing put request: HTTP status client error (404 Not Found)" + ), "{}", err ) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index f95eccdc6945..7a42fe384cd7 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + #![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] #![warn( missing_copy_implementations, @@ -78,7 +95,10 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of /// `foo/bar_baz/x`. - async fn list(&self, prefix: Option<&Path>) -> Result>>; + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>>; /// List objects with the given prefix and an implementation specific /// delimiter. Returns common prefixes (directories) in addition to object @@ -166,22 +186,24 @@ impl GetResult { match self { Self::File(mut file, path) => { maybe_spawn_blocking(move || { - let len = file - .seek(SeekFrom::End(0)) - .map_err(|source| local::Error::Seek { + let len = file.seek(SeekFrom::End(0)).map_err(|source| { + local::Error::Seek { source, path: path.clone(), - })?; + } + })?; - file.seek(SeekFrom::Start(0)) - .map_err(|source| local::Error::Seek { + file.seek(SeekFrom::Start(0)).map_err(|source| { + local::Error::Seek { source, path: path.clone(), - })?; + } + })?; let mut buffer = Vec::with_capacity(len as usize); - file.read_to_end(&mut buffer) - .map_err(|source| local::Error::UnableToReadBytes { source, path })?; + file.read_to_end(&mut buffer).map_err(|source| { + local::Error::UnableToReadBytes { source, path } + })?; Ok(buffer.into()) }) @@ -210,25 +232,28 @@ impl GetResult { Self::File(file, path) => { const CHUNK_SIZE: usize = 8 * 1024; - futures::stream::try_unfold((file, path, false), |(mut file, path, finished)| { - maybe_spawn_blocking(move || { - if finished { - return Ok(None); - } - - let mut buffer = Vec::with_capacity(CHUNK_SIZE); - let read = file - .by_ref() - .take(CHUNK_SIZE as u64) - .read_to_end(&mut buffer) - .map_err(|e| local::Error::UnableToReadBytes { - source: e, - path: path.clone(), - })?; - - Ok(Some((buffer.into(), (file, path, read != CHUNK_SIZE)))) - }) - }) + futures::stream::try_unfold( + (file, path, false), + |(mut file, path, finished)| { + maybe_spawn_blocking(move || { + if finished { + return Ok(None); + } + + let mut buffer = Vec::with_capacity(CHUNK_SIZE); + let read = file + .by_ref() + .take(CHUNK_SIZE as u64) + .read_to_end(&mut buffer) + .map_err(|e| local::Error::UnableToReadBytes { + source: e, + path: path.clone(), + })?; + + Ok(Some((buffer.into(), (file, path, read != CHUNK_SIZE)))) + }) + }, + ) .boxed() } Self::Stream(s) => s, @@ -472,7 +497,9 @@ mod tests { Ok(()) } - pub(crate) async fn list_uses_directories_correctly(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn list_uses_directories_correctly( + storage: &DynObjectStore, + ) -> Result<()> { delete_fixtures(storage).await; let content_list = flatten_list_stream(storage, None).await?; @@ -576,7 +603,8 @@ mod tests { storage: &DynObjectStore, location: Option, ) -> crate::Result { - let location = location.unwrap_or_else(|| 
Path::from("this_file_should_not_exist")); + let location = + location.unwrap_or_else(|| Path::from("this_file_should_not_exist")); let err = storage.head(&location).await.unwrap_err(); assert!(matches!(err, crate::Error::NotFound { .. })); diff --git a/object_store/src/local.rs b/object_store/src/local.rs index b60dcb425d5b..8dc78cd63e11 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + //! An object store implementation for a local filesystem use crate::{ maybe_spawn_blocking, @@ -227,7 +244,9 @@ impl ObjectStore for LocalFileSystem { match File::create(&path) { Ok(f) => f, - Err(err) => return Err(Error::UnableToCreateFile { path, err }.into()), + Err(err) => { + return Err(Error::UnableToCreateFile { path, err }.into()) + } } } Err(err) => return Err(Error::UnableToCreateFile { path, err }.into()), @@ -284,10 +303,11 @@ impl ObjectStore for LocalFileSystem { maybe_spawn_blocking(move || { let file = open_file(&path)?; - let metadata = file.metadata().map_err(|e| Error::UnableToAccessMetadata { - source: e.into(), - path: location.to_string(), - })?; + let metadata = + file.metadata().map_err(|e| Error::UnableToAccessMetadata { + source: e.into(), + path: location.to_string(), + })?; convert_metadata(metadata, location) }) @@ -303,7 +323,10 @@ impl ObjectStore for LocalFileSystem { .await } - async fn list(&self, prefix: Option<&Path>) -> Result>> { + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { let config = Arc::clone(&self.config); let root_path = match prefix { @@ -315,19 +338,18 @@ impl ObjectStore for LocalFileSystem { // Don't include the root directory itself .min_depth(1); - let s = - walkdir.into_iter().flat_map(move |result_dir_entry| { - match convert_walkdir_result(result_dir_entry) { - Err(e) => Some(Err(e)), - Ok(None) => None, - Ok(entry @ Some(_)) => entry - .filter(|dir_entry| dir_entry.file_type().is_file()) - .map(|entry| { - let location = config.filesystem_to_path(entry.path())?; - convert_entry(entry, location) - }), - } - }); + let s = walkdir.into_iter().flat_map(move |result_dir_entry| { + match convert_walkdir_result(result_dir_entry) { + Err(e) => Some(Err(e)), + Ok(None) => None, + Ok(entry @ Some(_)) => entry + .filter(|dir_entry| dir_entry.file_type().is_file()) + .map(|entry| { + let location = config.filesystem_to_path(entry.path())?; + convert_entry(entry, location) + }), + } + }); // If no tokio context, return iterator directly as no // need to perform chunked spawn_blocking reads @@ -339,26 +361,27 @@ impl ObjectStore for LocalFileSystem { const CHUNK_SIZE: usize = 1024; let buffer = VecDeque::with_capacity(CHUNK_SIZE); - let stream = futures::stream::try_unfold((s, buffer), |(mut s, mut 
buffer)| async move { - if buffer.is_empty() { - (s, buffer) = tokio::task::spawn_blocking(move || { - for _ in 0..CHUNK_SIZE { - match s.next() { - Some(r) => buffer.push_back(r), - None => break, + let stream = + futures::stream::try_unfold((s, buffer), |(mut s, mut buffer)| async move { + if buffer.is_empty() { + (s, buffer) = tokio::task::spawn_blocking(move || { + for _ in 0..CHUNK_SIZE { + match s.next() { + Some(r) => buffer.push_back(r), + None => break, + } } - } - (s, buffer) - }) - .await?; - } + (s, buffer) + }) + .await?; + } - match buffer.pop_front() { - Some(Err(e)) => Err(e), - Some(Ok(meta)) => Ok(Some((meta, (s, buffer)))), - None => Ok(None), - } - }); + match buffer.pop_front() { + Some(Err(e)) => Err(e), + Some(Ok(meta)) => Ok(Some((meta, (s, buffer)))), + None => Ok(None), + } + }); Ok(stream.boxed()) } diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index dcee1b330876..ffd8e3a5207d 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + //! An in-memory object store implementation use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; use async_trait::async_trait; @@ -97,7 +114,10 @@ impl ObjectStore for InMemory { Ok(()) } - async fn list(&self, prefix: Option<&Path>) -> Result>> { + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { let last_modified = Utc::now(); let storage = self.storage.read(); diff --git a/object_store/src/oauth.rs b/object_store/src/oauth.rs index 173986ea617e..273e37b64922 100644 --- a/object_store/src/oauth.rs +++ b/object_store/src/oauth.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
 use crate::token::TemporaryToken;
 use reqwest::{Client, Method};
 use ring::signature::RsaKeyPair;
diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs
index 5e045aae7c4f..23488ef660c5 100644
--- a/object_store/src/path/mod.rs
+++ b/object_store/src/path/mod.rs
@@ -1,3 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
 //! Path abstraction for Object Storage
 
 use itertools::Itertools;
@@ -148,7 +165,9 @@ impl Path {
     ///
     /// This will return an error if the path does not exist, or contains illegal
     /// character sequences as defined by [`Path::parse`]
-    pub fn from_filesystem_path(path: impl AsRef<std::path::Path>) -> Result<Self> {
+    pub fn from_filesystem_path(
+        path: impl AsRef<std::path::Path>,
+    ) -> Result<Self> {
         Self::from_filesystem_path_with_base(path, None)
     }
 
@@ -163,14 +182,12 @@ impl Path {
     ) -> Result<Self> {
         let url = filesystem_path_to_url(path)?;
         let path = match base {
-            Some(prefix) => {
-                url.path()
-                    .strip_prefix(prefix.path())
-                    .ok_or_else(|| Error::PrefixMismatch {
-                        path: url.path().to_string(),
-                        prefix: prefix.to_string(),
-                    })?
-            }
+            Some(prefix) => url.path().strip_prefix(prefix.path()).ok_or_else(|| {
+                Error::PrefixMismatch {
+                    path: url.path().to_string(),
+                    prefix: prefix.to_string(),
+                }
+            })?,
             None => url.path(),
         };
 
@@ -197,16 +214,23 @@ impl Path {
     /// Returns an iterator of the [`PathPart`] of this [`Path`] after `prefix`
     ///
     /// Returns `None` if the prefix does not match
-    pub fn prefix_match(&self, prefix: &Self) -> Option<impl Iterator<Item = PathPart<'_>> + '_> {
+    pub fn prefix_match(
+        &self,
+        prefix: &Self,
+    ) -> Option<impl Iterator<Item = PathPart<'_>> + '_> {
         let diff = itertools::diff_with(self.parts(), prefix.parts(), |a, b| a == b);
 
         match diff {
             // Both were equal
             None => Some(itertools::Either::Left(std::iter::empty())),
             // Mismatch or prefix was longer => None
-            Some(itertools::Diff::FirstMismatch(_, _, _) | itertools::Diff::Longer(_, _)) => None,
+            Some(
+                itertools::Diff::FirstMismatch(_, _, _) | itertools::Diff::Longer(_, _),
+            ) => None,
             // Match with remaining
-            Some(itertools::Diff::Shorter(_, back)) => Some(itertools::Either::Right(back)),
+            Some(itertools::Diff::Shorter(_, back)) => {
+                Some(itertools::Either::Right(back))
+            }
         }
     }
 
@@ -273,7 +297,9 @@ where
 
 /// Given a filesystem path, convert it to its canonical URL representation,
 /// returning an error if the file doesn't exist on the local filesystem
-pub(crate) fn filesystem_path_to_url(path: impl AsRef<std::path::Path>) -> Result<Url> {
+pub(crate) fn filesystem_path_to_url(
+    path: impl AsRef<std::path::Path>,
+) -> Result<Url> {
     let path = path.as_ref().canonicalize().context(CanonicalizeSnafu {
         path: path.as_ref(),
     })?;
@@ -457,7 +483,8 @@ mod tests {
     #[test]
     fn prefix_matches_with_file_name() {
-        let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo.segment"]);
+        let haystack =
+            Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo.segment"]);
 
         // All directories match and file name is a prefix
         let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo"]);
diff --git a/object_store/src/path/parts.rs b/object_store/src/path/parts.rs
index 1ecc5dcdd286..e73b184fc15c 100644
--- a/object_store/src/path/parts.rs
+++ b/object_store/src/path/parts.rs
@@ -1,3 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
 use percent_encoding::{percent_decode, percent_encode, AsciiSet, CONTROLS};
 use std::borrow::Cow;
 
diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs
index 2b4087480ef3..493ab6e933b5 100644
--- a/object_store/src/throttle.rs
+++ b/object_store/src/throttle.rs
@@ -1,3 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
 //! A throttling object store wrapper
 use parking_lot::Mutex;
 use std::ops::Range;
@@ -182,7 +199,10 @@ impl<T: ObjectStore> ObjectStore for ThrottledStore<T> {
         self.inner.delete(location).await
     }
 
-    async fn list(&self, prefix: Option<&Path>) -> Result<BoxStream<'_, Result<ObjectMeta>>> {
+    async fn list(
+        &self,
+        prefix: Option<&Path>,
+    ) -> Result<BoxStream<'_, Result<ObjectMeta>>> {
         sleep(self.config().wait_list_per_call).await;
 
         // need to copy to avoid moving / referencing `self`
@@ -209,7 +229,8 @@ impl<T: ObjectStore> ObjectStore for ThrottledStore<T> {
         match self.inner.list_with_delimiter(prefix).await {
             Ok(list_result) => {
                 let entries_len = usize_to_u32_saturate(list_result.objects.len());
-                sleep(self.config().wait_list_with_delimiter_per_entry * entries_len).await;
+                sleep(self.config().wait_list_with_delimiter_per_entry * entries_len)
+                    .await;
                 Ok(list_result)
             }
             Err(err) => Err(err),
@@ -385,7 +406,10 @@ mod tests {
         assert_bounds!(measure_put(&store, 0).await, 0);
     }
 
-    async fn place_test_object(store: &ThrottledStore<InMemory>, n_bytes: Option<usize>) -> Path {
+    async fn place_test_object(
+        store: &ThrottledStore<InMemory>,
+        n_bytes: Option<usize>,
+    ) -> Path {
         let path = Path::from("foo");
 
         if let Some(n_bytes) = n_bytes {
@@ -400,7 +424,10 @@ mod tests {
         path
     }
 
-    async fn place_test_objects(store: &ThrottledStore<InMemory>, n_entries: usize) -> Path {
+    async fn place_test_objects(
+        store: &ThrottledStore<InMemory>,
+        n_entries: usize,
+    ) -> Path {
         let prefix = Path::from("foo");
 
         // clean up store
@@ -427,7 +454,10 @@ mod tests {
         prefix
     }
 
-    async fn measure_delete(store: &ThrottledStore<InMemory>, n_bytes: Option<usize>) -> Duration {
+    async fn measure_delete(
+        store: &ThrottledStore<InMemory>,
+        n_bytes: Option<usize>,
+    ) -> Duration {
         let path = place_test_object(store, n_bytes).await;
 
         let t0 = Instant::now();
@@ -436,7 +466,10 @@ mod tests {
         t0.elapsed()
     }
 
-    async fn measure_get(store: &ThrottledStore<InMemory>, n_bytes: Option<usize>) -> Duration {
+    async fn measure_get(
+        store: &ThrottledStore<InMemory>,
+        n_bytes: Option<usize>,
+    ) -> Duration {
         let path = place_test_object(store, n_bytes).await;
 
         let t0 = Instant::now();
@@ -459,7 +492,10 @@ mod tests {
         t0.elapsed()
     }
 
-    async fn measure_list(store: &ThrottledStore<InMemory>, n_entries: usize) -> Duration {
+    async fn measure_list(
+        store: &ThrottledStore<InMemory>,
+        n_entries: usize,
+    ) -> Duration {
         let prefix = place_test_objects(store, n_entries).await;
 
         let t0 = Instant::now();
diff --git a/object_store/src/token.rs b/object_store/src/token.rs
index 4e7aa84c54bf..a56a29462b14 100644
--- a/object_store/src/token.rs
+++ b/object_store/src/token.rs
@@ -1,3 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
 use std::future::Future;
 use std::time::Instant;
 use tokio::sync::Mutex;
diff --git a/object_store/src/util.rs b/object_store/src/util.rs
index 7671da3d84c7..4f3ed86fdc69 100644
--- a/object_store/src/util.rs
+++ b/object_store/src/util.rs
@@ -1,3 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
 //! Common logic for interacting with remote object stores
 use super::Result;
 use bytes::Bytes;

From ed0f926612adafe40ff0e0fef0eee29dd9e03418 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Wed, 20 Jul 2022 15:52:35 -0400
Subject: [PATCH 10/12] ignore bubble_up_io_errors test

---
 object_store/src/local.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/object_store/src/local.rs b/object_store/src/local.rs
index 8dc78cd63e11..b16e4848ef78 100644
--- a/object_store/src/local.rs
+++ b/object_store/src/local.rs
@@ -623,6 +623,8 @@ mod tests {
 
     #[tokio::test]
     #[cfg(target_family = "unix")]
+    // Fails on github actions runner (which runs the tests as root)
+    #[ignore]
     async fn bubble_up_io_errors() {
         use std::{fs::set_permissions, os::unix::prelude::PermissionsExt};
 

From eb77b53941728fadccd39b70d064a54a4d2d774d Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 21 Jul 2022 17:17:29 -0400
Subject: [PATCH 11/12] Fix list_store with explicit lifetime, only run `test_list_root` on linux

---
 object_store/src/lib.rs   | 4 ++--
 object_store/src/local.rs | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs
index 7a42fe384cd7..4a56b03bfb72 100644
--- a/object_store/src/lib.rs
+++ b/object_store/src/lib.rs
@@ -684,9 +684,9 @@ mod tests {
     }
 
     /// Test that the returned stream does not borrow the lifetime of Path
-    async fn list_store<'a>(
+    async fn list_store<'a, 'b>(
         store: &'a dyn ObjectStore,
-        path_str: &str,
+        path_str: &'b str,
     ) -> super::Result<BoxStream<'a, super::Result<ObjectMeta>>> {
         let path = Path::from(path_str);
         store.list(Some(&path)).await
diff --git a/object_store/src/local.rs b/object_store/src/local.rs
index b16e4848ef78..8a9462eba9bf 100644
--- a/object_store/src/local.rs
+++ b/object_store/src/local.rs
@@ -703,6 +703,8 @@ mod tests {
     }
 
     #[tokio::test]
+    #[cfg(target_os = "linux")]
+    // macos has some magic in its root '/.VolumeIcon.icns"' which causes this test to fail
     async fn test_list_root() {
         let integration = LocalFileSystem::new();
         let result = integration.list_with_delimiter(None).await;

From 01f3908d043bc8f54fa883e2926635e322b7674a Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 22 Jul 2022 13:42:15 -0400
Subject: [PATCH 12/12] Do not run object_store throttle tests on a mac

---
 object_store/src/throttle.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs
index 493ab6e933b5..7a54a06c983b 100644
--- a/object_store/src/throttle.rs
+++ b/object_store/src/throttle.rs
@@ -314,6 +314,8 @@ mod tests {
     }
 
     #[tokio::test]
+    // macos github runner is so slow it can't complete within WAIT_TIME*2
+    #[cfg(target_os = "linux")]
     async fn get_test() {
         let inner = InMemory::new();
         let store = ThrottledStore::new(inner, ThrottleConfig::default());
@@ -341,6 +343,8 @@ mod tests {
     }
 
     #[tokio::test]
+    // macos github runner is so slow it can't complete within WAIT_TIME*2
+    #[cfg(target_os = "linux")]
     async fn list_test() {
         let inner = InMemory::new();
         let store = ThrottledStore::new(inner, ThrottleConfig::default());
@@ -366,6 +370,8 @@ mod tests {
     }
 
     #[tokio::test]
+    // macos github runner is so slow it can't complete within WAIT_TIME*2
+    #[cfg(target_os = "linux")]
     async fn list_with_delimiter_test() {
         let inner = InMemory::new();
         let store = ThrottledStore::new(inner, ThrottleConfig::default());