From b3fdcef0078a4fca15a9f64bd4383b365d95e778 Mon Sep 17 00:00:00 2001 From: Ravind Kumar Date: Fri, 12 May 2023 12:19:45 -0400 Subject: [PATCH] DOCS-763: Update pool, availability, resiliency, and erasure code docs (#843) --- source/_static/scss/includes/_misc.scss | 19 ++ source/glossary.rst | 6 + .../architecture-4-node-deploy.svg | 1 + ...chitecture-erasure-set-retrieve-object.svg | 1 + .../architecture-erasure-set-shard.svg | 1 + .../architecture-load-balancer-8-node.svg | 1 + .../architecture-load-balancer-multi-pool.svg | 1 + ...cture-load-balancer-multi-site-healing.svg | 1 + .../architecture-load-balancer-multi-site.svg | 1 + .../architecture-multi-site-latency.svg | 1 + .../architecture-multi-site-setup.svg | 1 + .../architecture/architecture-multi-site.svg | 1 + .../architecture-multiple-clients.svg | 1 + .../architecture-one-node-DAS.svg | 1 + .../availability-erasure-set-failure.svg | 1 + .../availability-erasure-set-selection.svg | 1 + ...lability-erasure-sharding-degraded-set.svg | 1 + ...bility-erasure-sharding-degraded-write.svg | 1 + ...availability-erasure-sharding-degraded.svg | 1 + ...ilability-erasure-sharding-split-brain.svg | 1 + .../availability-erasure-sharding-striped.svg | 1 + .../availability-erasure-sharding.svg | 1 + .../availability-pool-failure.svg | 1 + source/includes/common/installation.rst | 4 + source/operations/checklists.rst | 2 +- source/operations/concepts.rst | 9 +- source/operations/concepts/architecture.rst | 188 ++++++++++++++++++ .../concepts/availability-and-resiliency.rst | 157 +++++++++++++++ source/operations/concepts/erasure-coding.rst | 21 +- .../expand-minio-deployment.rst | 22 +- 30 files changed, 421 insertions(+), 28 deletions(-) create mode 100644 source/images/architecture/architecture-4-node-deploy.svg create mode 100644 source/images/architecture/architecture-erasure-set-retrieve-object.svg create mode 100644 source/images/architecture/architecture-erasure-set-shard.svg create mode 100644 source/images/architecture/architecture-load-balancer-8-node.svg create mode 100644 source/images/architecture/architecture-load-balancer-multi-pool.svg create mode 100644 source/images/architecture/architecture-load-balancer-multi-site-healing.svg create mode 100644 source/images/architecture/architecture-load-balancer-multi-site.svg create mode 100644 source/images/architecture/architecture-multi-site-latency.svg create mode 100644 source/images/architecture/architecture-multi-site-setup.svg create mode 100644 source/images/architecture/architecture-multi-site.svg create mode 100644 source/images/architecture/architecture-multiple-clients.svg create mode 100644 source/images/architecture/architecture-one-node-DAS.svg create mode 100644 source/images/availability/availability-erasure-set-failure.svg create mode 100644 source/images/availability/availability-erasure-set-selection.svg create mode 100644 source/images/availability/availability-erasure-sharding-degraded-set.svg create mode 100644 source/images/availability/availability-erasure-sharding-degraded-write.svg create mode 100644 source/images/availability/availability-erasure-sharding-degraded.svg create mode 100644 source/images/availability/availability-erasure-sharding-split-brain.svg create mode 100644 source/images/availability/availability-erasure-sharding-striped.svg create mode 100644 source/images/availability/availability-erasure-sharding.svg create mode 100644 source/images/availability/availability-pool-failure.svg create mode 100644 
source/operations/concepts/architecture.rst create mode 100644 source/operations/concepts/availability-and-resiliency.rst diff --git a/source/_static/scss/includes/_misc.scss b/source/_static/scss/includes/_misc.scss index d7e8a80b..fa859a85 100644 --- a/source/_static/scss/includes/_misc.scss +++ b/source/_static/scss/includes/_misc.scss @@ -358,3 +358,22 @@ button.copybtn { transform: rotate(180deg) translateY(50%); } } + +// ---------------------------------- +// Cleaning up captions for figures +// ---------------------------------- + +figure { + box-shadow: 0px 0px 10px 5px lightgrey; + margin-bottom: 2.5rem; +} + +figcaption { + + border-top: lightgrey solid 1px; + + & span.caption-text { + font-size: small; + font-weight: bold; + } +} \ No newline at end of file diff --git a/source/glossary.rst b/source/glossary.rst index 00270551..573a56db 100644 --- a/source/glossary.rst +++ b/source/glossary.rst @@ -239,6 +239,12 @@ Glossary Renamed to :term:`access keys`. A MinIO deployment or tenant user account with limited account typically used with API calls. + shard + shards + A portion of an object after being :term:`erasure coded ` by MinIO. + Each "shard" represents either data or parity for MinIO to use for reconstructing objects on read requests. + + For more detailed information, see :ref:`minio-erasure-coding`. single-node multi-drive SNMD diff --git a/source/images/architecture/architecture-4-node-deploy.svg b/source/images/architecture/architecture-4-node-deploy.svg new file mode 100644 index 00000000..d8d44825 --- /dev/null +++ b/source/images/architecture/architecture-4-node-deploy.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/architecture/architecture-erasure-set-retrieve-object.svg b/source/images/architecture/architecture-erasure-set-retrieve-object.svg new file mode 100644 index 00000000..eaee8897 --- /dev/null +++ b/source/images/architecture/architecture-erasure-set-retrieve-object.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/architecture/architecture-erasure-set-shard.svg b/source/images/architecture/architecture-erasure-set-shard.svg new file mode 100644 index 00000000..99373f14 --- /dev/null +++ b/source/images/architecture/architecture-erasure-set-shard.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/architecture/architecture-load-balancer-8-node.svg b/source/images/architecture/architecture-load-balancer-8-node.svg new file mode 100644 index 00000000..f4059c3a --- /dev/null +++ b/source/images/architecture/architecture-load-balancer-8-node.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/architecture/architecture-load-balancer-multi-pool.svg b/source/images/architecture/architecture-load-balancer-multi-pool.svg new file mode 100644 index 00000000..ccdbd4fc --- /dev/null +++ b/source/images/architecture/architecture-load-balancer-multi-pool.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/architecture/architecture-load-balancer-multi-site-healing.svg b/source/images/architecture/architecture-load-balancer-multi-site-healing.svg new file mode 100644 index 00000000..a3a7a287 --- /dev/null +++ b/source/images/architecture/architecture-load-balancer-multi-site-healing.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/architecture/architecture-load-balancer-multi-site.svg b/source/images/architecture/architecture-load-balancer-multi-site.svg new file mode 100644 index 00000000..1a1710b8 --- /dev/null +++ 
b/source/images/architecture/architecture-load-balancer-multi-site.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/architecture/architecture-multi-site-latency.svg b/source/images/architecture/architecture-multi-site-latency.svg new file mode 100644 index 00000000..0bdd154e --- /dev/null +++ b/source/images/architecture/architecture-multi-site-latency.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/architecture/architecture-multi-site-setup.svg b/source/images/architecture/architecture-multi-site-setup.svg new file mode 100644 index 00000000..12b4eeed --- /dev/null +++ b/source/images/architecture/architecture-multi-site-setup.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/architecture/architecture-multi-site.svg b/source/images/architecture/architecture-multi-site.svg new file mode 100644 index 00000000..64a174bc --- /dev/null +++ b/source/images/architecture/architecture-multi-site.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/architecture/architecture-multiple-clients.svg b/source/images/architecture/architecture-multiple-clients.svg new file mode 100644 index 00000000..b98fe1f9 --- /dev/null +++ b/source/images/architecture/architecture-multiple-clients.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/architecture/architecture-one-node-DAS.svg b/source/images/architecture/architecture-one-node-DAS.svg new file mode 100644 index 00000000..1b15e13d --- /dev/null +++ b/source/images/architecture/architecture-one-node-DAS.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/availability/availability-erasure-set-failure.svg b/source/images/availability/availability-erasure-set-failure.svg new file mode 100644 index 00000000..51d9b53d --- /dev/null +++ b/source/images/availability/availability-erasure-set-failure.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/availability/availability-erasure-set-selection.svg b/source/images/availability/availability-erasure-set-selection.svg new file mode 100644 index 00000000..92eb00d5 --- /dev/null +++ b/source/images/availability/availability-erasure-set-selection.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/availability/availability-erasure-sharding-degraded-set.svg b/source/images/availability/availability-erasure-sharding-degraded-set.svg new file mode 100644 index 00000000..cd109f33 --- /dev/null +++ b/source/images/availability/availability-erasure-sharding-degraded-set.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/availability/availability-erasure-sharding-degraded-write.svg b/source/images/availability/availability-erasure-sharding-degraded-write.svg new file mode 100644 index 00000000..7738f082 --- /dev/null +++ b/source/images/availability/availability-erasure-sharding-degraded-write.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/availability/availability-erasure-sharding-degraded.svg b/source/images/availability/availability-erasure-sharding-degraded.svg new file mode 100644 index 00000000..c7fe9d03 --- /dev/null +++ b/source/images/availability/availability-erasure-sharding-degraded.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/availability/availability-erasure-sharding-split-brain.svg b/source/images/availability/availability-erasure-sharding-split-brain.svg new file mode 100644 index 00000000..1a6616d1 --- /dev/null +++ 
b/source/images/availability/availability-erasure-sharding-split-brain.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/availability/availability-erasure-sharding-striped.svg b/source/images/availability/availability-erasure-sharding-striped.svg new file mode 100644 index 00000000..1a6616d1 --- /dev/null +++ b/source/images/availability/availability-erasure-sharding-striped.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/availability/availability-erasure-sharding.svg b/source/images/availability/availability-erasure-sharding.svg new file mode 100644 index 00000000..bad148b3 --- /dev/null +++ b/source/images/availability/availability-erasure-sharding.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/images/availability/availability-pool-failure.svg b/source/images/availability/availability-pool-failure.svg new file mode 100644 index 00000000..2f029892 --- /dev/null +++ b/source/images/availability/availability-pool-failure.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/source/includes/common/installation.rst b/source/includes/common/installation.rst index 732d4397..077c80c5 100644 --- a/source/includes/common/installation.rst +++ b/source/includes/common/installation.rst @@ -10,6 +10,10 @@ Install and Deploy MinIO :local: :depth: 1 +.. meta:: + :description: MinIO Deployment Topologies and Installation Instructions + :keywords: MinIO, Deploy, Architecture, Topology, Distributed, Replication, Install + .. container:: extlinks-video - `Installing and Running MinIO on Linux `__ diff --git a/source/operations/checklists.rst b/source/operations/checklists.rst index a7bb4f8f..8a41fa7b 100644 --- a/source/operations/checklists.rst +++ b/source/operations/checklists.rst @@ -20,7 +20,7 @@ Community Support is best-effort only and has no SLAs around responsiveness. Checklists: -- :ref:`Hardware checklists ` +- :ref:`Hardware Checklist ` - :ref:`Security Checklist ` - :ref:`Software Checklist ` - :ref:`Thresholds and Limits ` diff --git a/source/operations/concepts.rst b/source/operations/concepts.rst index 068f03d6..58a9a85f 100644 --- a/source/operations/concepts.rst +++ b/source/operations/concepts.rst @@ -60,8 +60,11 @@ While testing MinIO may only involve a single drive on a single computer, most p A server pool a set of :mc:`minio server` nodes that pool their drives and resources to support object storage write and retrieval requests. MinIO supports adding one or more server pools to existing MinIO deployments for horizontal expansion. -When MinIO has multiple server pools available, an individual object always writes to the same server pool. -If one server pool goes down, objects on other pools remain accessible. +When MinIO has multiple server pools available, an individual object always writes to the same erasure set in the same server pool. + +If one server pool goes down, MinIO halts I/O to all pools until the cluster resumes normal operations. +You must restore the pool to working operation to resume I/O to the deployment. +Objects written to other pools remain safe on disk while you perform repair operations. The :mc-cmd:`~minio server HOSTNAME` argument passed to the :mc:`minio server` command represents a Server Pool: @@ -93,7 +96,7 @@ Consider the command below that creates a cluster consisting of two Server Pools | Server Pool | -Within a cluster, MinIO always stores each unique object and all versions of that object on the same Server Pool. 
+Each server pool has one or more :ref:`erasure sets ` depending on the number of drives and nodes in the pool. MinIO strongly recommends production clusters consist of a *minimum* of 4 :mc:`minio server` nodes in a Server Pool for proper high availability and durability guarantees. diff --git a/source/operations/concepts/architecture.rst b/source/operations/concepts/architecture.rst new file mode 100644 index 00000000..8d624ba2 --- /dev/null +++ b/source/operations/concepts/architecture.rst @@ -0,0 +1,188 @@ +.. _minio-architecture: + +======================= +Deployment Architecture +======================= + +.. default-domain:: minio + +.. contents:: Table of Contents + :local: + :depth: 2 + +.. meta:: + :keywords: topology, architecture, deployment, production + :description: Information on MinIO Deployment architecture and topology in production environments + +This page provides an overview of MinIO deployment architectures from a production perspective. +For information on specific hardware or software configurations, see: + +- :ref:`Hardware Checklist ` +- :ref:`Security Checklist ` +- :ref:`Software Checklist ` +- :ref:`Thresholds and Limits ` + +Distributed MinIO Deployments +----------------------------- + +A production MinIO deployment consists of at least 4 MinIO hosts with homogeneous storage and compute resources. + MinIO aggregates these resources together as a :ref:`pool ` and presents itself as a single object storage service. + + .. figure:: /images/architecture/architecture-4-node-deploy.svg + :figwidth: 100% + :alt: 4 Node MinIO deployment with homogeneous storage and compute resources + :align: center + + Each MinIO host in this pool has matching compute, storage, and network configurations + +MinIO provides best performance when using direct-attached storage (DAS), such as NVMe or SSD drives attached to a PCI-E controller board on the host machine. + Storage controllers should present XFS-formatted drives in "Just a Bunch of Drives" (JBOD) configurations with no RAID, pooling, or other hardware/software resiliency layers. + MinIO recommends against caching, either at the drive or the controller layer. + Either type of caching can cause :abbr:`I/O (Input / Output)` spikes as the cache fills and clears, resulting in unpredictable performance. + + .. figure:: /images/architecture/architecture-one-node-DAS.svg + :figwidth: 100% + :alt: MinIO Server diagram of Direct-Attached Storage via SAS to a PCI-E Storage Controller + :align: center + + Each SSD connects by SAS to a PCI-E-attached storage controller operating in HBA mode + +MinIO automatically groups drives in the pool into :ref:`erasure sets `. + Erasure sets are the foundational component of MinIO :ref:`availability and resiliency `. + MinIO stripes erasure sets across the nodes in the pool to maintain even distribution of erasure set drives. + MinIO then shards objects into data and parity blocks based on the deployment :ref:`parity ` and distributes them across an erasure set. + + For a more complete discussion of MinIO redundancy and healing, see :ref:`minio-erasure-coding`. + + .. figure:: /images/architecture/architecture-erasure-set-shard.svg + :figwidth: 100% + :alt: Diagram of object being sharded into four data and four parity blocks, distributed across eight drives + :align: center + + With the default parity of ``EC:4``, MinIO shards the object into 4 data and 4 parity blocks, distributing them across the drives in the erasure set. 
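
The shard counts above follow directly from the erasure set size and the configured parity. The sketch below is illustrative arithmetic only, not MinIO source code; it assumes the 8-drive erasure set and default ``EC:4`` parity shown in the figure above, and applies the quorum rules described in :ref:`minio_availability-resiliency`.

.. code-block:: python

   # Illustrative arithmetic only -- this is not MinIO's implementation.
   # It mirrors the shard-count and drive-loss tolerance rules in these docs.

   def shard_layout(set_size: int, parity: int) -> dict:
       """Data/parity split and drive-loss tolerance for one erasure set."""
       data_shards = set_size - parity      # EC:N leaves (set_size - N) data shards
       read_tolerance = parity              # reads survive up to N lost drives
       if parity < set_size / 2:
           write_tolerance = parity         # write quorum equals parity
       else:
           write_tolerance = parity - 1     # parity is half the set: write quorum is N + 1
       return {
           "data_shards": data_shards,
           "parity_shards": parity,
           "read_tolerance": read_tolerance,
           "write_tolerance": write_tolerance,
       }

   # The 8-drive erasure set from the figure above at the default EC:4:
   print(shard_layout(set_size=8, parity=4))
   # {'data_shards': 4, 'parity_shards': 4, 'read_tolerance': 4, 'write_tolerance': 3}

Because ``EC:4`` is exactly half of an 8-drive erasure set, writes tolerate one fewer lost drive than reads; on a 16-drive erasure set with the same parity, both reads and writes tolerate the loss of 4 drives.
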
+ +MinIO uses a deterministic algorithm to select the erasure set for a given object. + For each unique object namespace ``BUCKET/PREFIX/[PREFIX/...]/OBJECT.EXTENSION``, MinIO always selects the same erasure set for read/write operations. + MinIO handles all routing within pools and erasure sets, making the select/read/write process entirely transparent to applications. + + .. figure:: /images/architecture/architecture-erasure-set-retrieve-object.svg + :figwidth: 100% + :alt: Diagram of object retrieval from only data shards + :align: center + + MinIO reconstructs objects from data or parity shards transparently before returning the object to the requesting client. + +Each MinIO server has a complete picture of the distributed topology, such that an application can connect and direct operations against any node in the deployment. + The MinIO responding node automatically handles routing internal requests to other nodes in the deployment *and* returning the final response to the client. + + Applications typically should not manage those connections, as any changes to the deployment topology would require application updates. + Production environments should instead deploy a load balancer or similar network control plane component to manage connections to the MinIO deployment. + For example, you can deploy an NGINX load balancer to perform "least connections" or "round robin" load balancing against the available nodes in the deployment. + + .. figure:: /images/architecture/architecture-load-balancer-8-node.svg + :figwidth: 100% + :alt: Diagram of an eight node MinIO deployment behind a load balancer + :align: center + + The load balancer routes the request to any node in the deployment. + The receiving node handles any internode requests thereafter. + +You can expand a MinIO deployment's available storage through :ref:`pool expansion `. + Each pool consists of an independent group of nodes with their own erasure sets. + MinIO must query each pool to determine the correct erasure set to which it directs read and write operations, such that each additional pool adds increased internode traffic per call. + The pool which contains the correct erasure set then responds to the operation, remaining entirely transparent to the application. + + If you modify the MinIO topology through pool expansion, you can update your applications by modifying the load balancer to include the new pool's nodes. + This ensures even distribution of requests across all pools, while applications continue using the single load balancer URL for MinIO operations. + + .. figure:: /images/architecture/architecture-load-balancer-multi-pool.svg + :figwidth: 100% + :alt: Diagram of a multi-pool minio deployment behind a load balancer + :align: center + + The PUT request requires checking each pool for the correct erasure set. + Once identified, MinIO partitions the object and distributes the data and parity shards across the appropriate set. + +Client applications can use any S3-compatible SDK or library to interact with the MinIO deployment. + MinIO publishes its own :ref:`drivers ` specifically intended for use with S3-compatible deployments. + Regardless of the driver, the S3 API uses HTTP methods like ``GET`` and ``POST`` for all operations. + Neither MinIO nor S3 implements proprietary wire protocols or other low-level interfaces for normal operations. + + .. 
figure:: /images/architecture/architecture-multiple-clients.svg + :figwidth: 100% + :alt: Diagram of multiple S3-compatible clients using SDKs to connect to MinIO + + Clients using a variety of S3-compatible SDKs can perform operations against the same MinIO deployment. + + MinIO uses a strict implementation of the S3 API, including requiring clients to sign all operations using AWS :s3-api:`Signature V4 ` or the legacy Signature V2. + AWS signature calculation uses the client-provided headers, such that any modification to those headers by load balancers, proxies, security programs, or other components can result in signature mismatch errors. + Ensure any such intermediate components support pass-through of unaltered headers from client to server. + + The complexity of signature calculation typically makes interfacing via ``curl`` or similar REST clients difficult or impractical. + MinIO recommends using S3-compatible drivers which perform the signature calculation automatically as part of operations. + +Replicated MinIO Deployments +---------------------------- + +MinIO :ref:`site replication ` provides support for synchronizing distinct independent deployments. + You can deploy peer sites in different racks, datacenters, or geographic regions to support functions like :abbr:`BC/DR (Business Continuity / Disaster Recovery)` or geo-local read/write performance in a globally distributed MinIO object store. + + .. figure:: /images/architecture/architecture-multi-site.svg + :figwidth: 100% + :alt: Diagram of a multi-site deployment with three MinIO peer site + + A MinIO multi-site deployment with three peers. + Write operations on one peer replicate to all other peers in the configuration automatically. + +Each peer site consists of an independent set of MinIO hosts, ideally having matching pool configurations. + The architecture of each peer site should closely match to ensure consistent performance and behavior between sites. + All peer sites must use the same primary identity provider, and during initial configuration only one peer site can have any data. + + .. figure:: /images/architecture/architecture-multi-site-setup.svg + :figwidth: 100% + :alt: Diagram of a multi-site deployment during initial setup + + The initial setup of a MinIO multi-site deployment. + The first peer site replicates all required information to other peers in the configuration. + Adding new peers uses the same sequence for synchronizing data. + +Replication performance primarily depends on the network latency between each peer site. + With geographically distributed peer sites, high latency between sites can result in significant replication lag. + This can compound with workloads that are near or at the deployment's overall performance capacity, as the replication process itself requires sufficient free :abbr:`I/O (Input / Output)` to synchronize objects. + + .. figure:: /images/architecture/architecture-multi-site-latency.svg + :figwidth: 100% + :alt: Diagram of a multi-site deployment with latency between sites + + In this peer configuration, the latency between Site A and its peer sites is 100ms. + The soonest the object fully synchronizes to all sites is at least 110ms. + +Deploying a global load balancer or similar network appliance with support for site-to-site failover protocols is critical to the functionality of multi-site deployments. + The load balancer should support a health probe/check setting to detect the failure of one site and automatically redirect applications to any remaining healthy peer. 
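
   As an illustration of what such a health probe does, the sketch below polls the ``/minio/health/cluster`` healthcheck endpoint on each peer site and reports which sites a load balancer should keep in rotation. It is a simplified stand-in for a real load balancer probe, and the site URLs are hypothetical.

   .. code-block:: python

      # Simplified stand-in for a load balancer health probe -- not a
      # replacement for a real network appliance. Site URLs are hypothetical.
      import urllib.error
      import urllib.request

      SITES = [
          "https://siteA.example.net:9000",
          "https://siteB.example.net:9000",
          "https://siteC.example.net:9000",
      ]

      def site_is_healthy(base_url: str, timeout: float = 2.0) -> bool:
          """Return True if the site reports cluster health (HTTP 200)."""
          try:
              with urllib.request.urlopen(f"{base_url}/minio/health/cluster", timeout=timeout) as resp:
                  return resp.status == 200
          except (urllib.error.URLError, OSError):
              return False

      healthy = [site for site in SITES if site_is_healthy(site)]
      print("Route client traffic to:", healthy or "no healthy sites")

   A production load balancer or global traffic manager performs this probe natively and removes failed sites from rotation automatically; the sketch only illustrates the decision it makes.
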
+ + .. figure:: /images/architecture/architecture-load-balancer-multi-site.svg + :figwidth: 100% + :alt: Diagram of a multi-site deployment with a failed site + + One of the peer sites has failed completely. + The load balancer automatically routes requests to the remaining healthy peer site. + + The load balancer should meet the same requirements as single-site deployments regarding connection balancing and header preservation. + MinIO replication handles transient failures by queuing objects for replication. + +MinIO replication can automatically heal a site that has partial data loss due to transient or sustained downtime. + If a peer site completely fails, you can remove that site from the configuration entirely. + The load balancer configuration should also remove that site to avoid routing client requests to the offline site. + + You can then restore the peer site, either after repairing the original hardware or replacing it entirely, by adding it back to the site replication configuration. + MinIO automatically begins resynchronizing content. + + .. figure:: /images/architecture/architecture-load-balancer-multi-site-healing.svg + :figwidth: 100% + :alt: Diagram of a multi-site deployment with a healing site + + The peer site has recovered and reestablished connectivity with its healthy peers. + MinIO automatically works through the replication queue to catch the site back up. + + Once all data synchronizes, you can restore normal connectivity to that site. + Depending on the amount of replication lag, latency between sites and overall workload :abbr:`I/O (Input / Output)`, you may need to temporarily stop write operations to allow the sites to completely catch up. \ No newline at end of file diff --git a/source/operations/concepts/availability-and-resiliency.rst b/source/operations/concepts/availability-and-resiliency.rst new file mode 100644 index 00000000..92d8329e --- /dev/null +++ b/source/operations/concepts/availability-and-resiliency.rst @@ -0,0 +1,157 @@ +.. _minio_availability-resiliency: + +=========================== +Availability and Resiliency +=========================== + +.. default-domain:: minio + +.. contents:: Table of Contents + :local: + :depth: 2 + +.. meta:: + :keywords: availability, resiliency, healing, recovery, distributed + :description: Information on MinIO Availability and Resiliency features in production environments + +This page provides an overview of MinIO's availability and resiliency design and features from a production perspective. + +.. note:: + + The contents of this page are intended as a best-effort guide to understanding MinIO's intended design and philosophy behind availability and resiliency. + It cannot replace the functionality of |subnet|, which allows for coordinating with MinIO Engineering when planning your MinIO deployments. + + Community users can seek support on the `MinIO Community Slack `__. + Community Support is best-effort only and has no SLAs around responsiveness. + +MinIO implements :ref:`erasure coding ` as the core component in providing availability and resiliency during drive or node-level failure events. + MinIO partitions each object into data and :ref:`parity ` shards and distributes those shards across a single :ref:`erasure set `. + + .. figure:: /images/availability/availability-erasure-sharding.svg + :figwidth: 100% + :align: center + :alt: Diagram of erasure coded object partitioned into twelve data shards and four parity shards + + This small one-node deployment has 16 drives in one erasure set. 
+ Assuming default :ref:`parity ` of ``EC:4``, MinIO shards the object into 4 (four) parity shards and 12 (twelve) data shards. + MinIO distributes these shards evenly across each drive in the erasure set. + +MinIO uses a deterministic algorithm to select the erasure set for a given object. + For each unique object namespace ``BUCKET/PREFIX/[PREFIX/...]/OBJECT.EXTENSION``, MinIO always selects the same erasure set for read/write operations. + This includes all :ref:`versions ` of that same object. + + .. figure:: /images/availability/availability-erasure-set-selection.svg + :figwidth: 100% + :align: center + :alt: Diagram of erasure set selection based on object namespace + + MinIO calculates the destination erasure set using the full object namespace. + +MinIO requires :ref:`read and write quorum ` to perform read and write operations against an erasure set. + The quorum depends on the configured parity for the deployment. + Read quorum always equals the configured parity, such that MinIO can perform read operations against any erasure set that has not lost more drives than parity. + + .. figure:: /images/availability/availability-erasure-sharding-degraded.svg + :figwidth: 100% + :align: center + :alt: Diagram of degraded erasure set, where two parity shards replace two data shards + + This node has two failed drives. + MinIO uses parity shards to replace the lost data shards automatically and serves the reconstructed object to the requesting client. + + With the default parity of ``EC:4``, the deployment can tolerate the loss of 4 (four) drives per erasure set and still serve read operations. + +Write quorum depends on the configured parity and the size of the erasure set. + If parity is less than 1/2 (half) the number of erasure set drives, write quorum equals parity and functions similarly to read quorum. + MinIO automatically "upgrades" the parity of objects written to a degraded erasure set to ensure that object can meet the same :abbr:`SLA (Service Level Agreement)` as objects in healthy erasure sets. + + .. figure:: /images/availability/availability-erasure-sharding-degraded-write.svg + :figwidth: 100% + :align: center + :alt: Diagram of degraded erasure set, where two drives have failed + + This node has two failed drives. + MinIO writes the object with an upgraded parity of ``EC:6`` to ensure this object meets the same SLA as other objects. + + With the default parity of ``EC:4``, the deployment can tolerate the loss of 4 drives per erasure set and still serve write operations. + MinIO can perform "parity upgrade" up to 1/2 (half) the drives in the erasure set. + +If parity equals 1/2 (half) the number of erasure set drives, write quorum equals parity + 1 (one) to avoid data inconsistency due to "split brain" scenarios. + For example, if exactly half the drives in the erasure set become isolated due to a network fault, MinIO would consider quorum lost as it cannot establish an N+1 group of drives for the write operation. + + .. figure:: /images/availability/availability-erasure-sharding-split-brain.svg + :figwidth: 100% + :align: center + :alt: Diagram of erasure set where half the drives have failed + + This node has 50% drive failure. + If parity is ``EC:8``, this erasure set cannot meet write quorum and MinIO rejects write operations to that set. + Since the erasure set still maintains read quorum, read operations to existing objects can still succeed. + +An erasure set which loses more drives than the configured parity has suffered data loss. 
+ For maximum parity configurations, the erasure set goes into "read only" mode if drive loss equals parity. + For the maximum erasure set size of 16 and maximum parity of 8, this would require the loss of 9 drives for data loss to occur. + + .. figure:: /images/availability/availability-erasure-sharding-degraded-set.svg + :figwidth: 100% + :align: center + :alt: Diagram of completely degraded erasure set + + This erasure set has lost more drives than the configured parity of ``EC:4`` and has therefore lost both read and write quorum. + MinIO cannot recover any data stored on this erasure set. + +MinIO further mitigates the risk of erasure set failure by "striping" erasure set drives across each node in the pool. + MinIO automatically calculates the optimal erasure set size based on the number of nodes and drives, where the maximum set size is 16 (sixteen). + It then selects one drive per node going across the pool for each erasure set, circling around if the erasure set stripe size is greater than the number of nodes. + This topology improves resiliency to the loss of a single node, or even a storage controller on that node. + + .. figure:: /images/availability/availability-erasure-sharding-striped.svg + :figwidth: 100% + :align: center + :alt: Diagram of a sixteen node by eight drive per node cluster, consisting of eight sixteen drive erasure sets striped evenly across each node. + + In this 16 x 8 deployment, MinIO would calculate 8 erasure sets of 16 drives each. + It allocates one drive per node across the available nodes to fill each erasure set. + If there were 8 nodes, MinIO would need to select 2 drives per node for each erasure set. + + In the above topology, the pool has 8 erasure sets of 16 drives each striped across 16 nodes. + Each node would have one drive allocated per erasure set. + While losing one node would technically result in the loss of 8 drives, each erasure set would only lose one drive each. + This maintains quorum despite the node downtime. + +Each erasure set is independent of all others in the same pool. + If one erasure set becomes completely degraded, MinIO can still perform read/write operations on other erasure sets. + + .. figure:: /images/availability/availability-erasure-set-failure.svg + :figwidth: 100% + :align: center + :alt: Diagram of a MinIO multi-pool deployment with one failed erasure set in a pool + + One pool has a degraded erasure set. + While MinIO can no longer serve read/write operations to that erasure set, it can continue to serve operations on healthy erasure sets in that pool. + + Since erasure sets are independent, you cannot restore data to a completely degraded erasure set using other erasure sets. + You must use :ref:`Site ` or :ref:`Bucket ` replication to create a :abbr:`BC/DR (Business Continuity / Disaster Recovery)`-ready remote deployment for restoring lost data. + +For multi-pool MinIO deployments, each pool requires at least one erasure set maintaining read/write quorum to continue performing operations. + If one pool loses all erasure sets, MinIO can no longer determine whether a given read/write operation would have routed to that pool. + MinIO therefore stops all I/O to the deployment, even if other pools remain operational. + + .. figure:: /images/availability/availability-pool-failure.svg + :figwidth: 100% + :align: center + :alt: Diagram of a MinIO multi-pool deployment with one failed pool. + + One pool in this deployment has completely failed. 
+ MinIO can no longer determine which pool or erasure set to route I/O to. + Continued operations could produce an inconsistent state where an object and/or its versions reside in different erasure sets. + MinIO therefore halts all :abbr:`I/O (Input/Output)` in the deployment until the pool recovers. + + To restore access to the deployment, administrators must restore the pool to normal operations. + This may require formatting disks, replacing hardware, or replacing nodes depending on the severity of the failure. + See :ref:`minio-restore-hardware-failure` for more complete documentation. + + Use replicated remotes to restore the lost data to the deployment. + All data stored on the healthy pools remains safe on disk. + + diff --git a/source/operations/concepts/erasure-coding.rst b/source/operations/concepts/erasure-coding.rst index a71faa73..adf2edac 100644 --- a/source/operations/concepts/erasure-coding.rst +++ b/source/operations/concepts/erasure-coding.rst @@ -13,9 +13,9 @@ Erasure Coding MinIO Erasure Coding is a data redundancy and availability feature that allows MinIO deployments to automatically reconstruct objects on-the-fly despite the loss of multiple drives or nodes in the cluster. Erasure Coding provides object-level healing with significantly less overhead than adjacent technologies such as RAID or replication. -MinIO splits each new object into data and parity blocks, where parity blocks support reconstruction of missing or corrupted data blocks. -MinIO writes these blocks to a single :ref:`erasure set ` in the deployment. -Since erasure set drives are striped across the server pool, a given node contains only a portion of data or parity blocks for each object. +MinIO partitions each new object into data and parity shards, where parity shards support reconstruction of missing or corrupted data shards. +MinIO writes these shards to a single :ref:`erasure set ` in the deployment. +Since erasure set drives are striped across the server pool, a given node contains only a portion of data or parity shards for each object. MinIO can therefore tolerate the loss of multiple drives or nodes in the deployment depending on the configured parity and deployment topology. .. image:: /images/erasure-code.jpg @@ -24,7 +24,7 @@ MinIO can therefore tolerate the loss of multiple drives or nodes in the deploym :align: center At maximum parity, MinIO can tolerate the loss of up to half the drives per erasure set (:math:`(N / 2) - 1`) and still perform read and write operations. -MinIO defaults to 4 parity blocks per object with tolerance for the loss of 4 drives per erasure set. +MinIO defaults to 4 parity shards per object with tolerance for the loss of 4 drives per erasure set. For more complete information on selecting erasure code parity, see :ref:`minio-ec-parity`. Use the MinIO `Erasure Code Calculator `__ when planning and designing your MinIO deployment to explore the effect of erasure code settings on your intended topology. @@ -35,7 +35,7 @@ Erasure Sets ------------ An *Erasure Set* is a group of drives onto which MinIO writes erasure coded objects. -MinIO randomly and uniformly distributes the data and parity blocks of a given object across the erasure set drives, where a given drive has no more than one block of either type per object (no overlap). +MinIO randomly and uniformly distributes the data and parity shards of a given object across the erasure set drives, where a given drive has no more than one shard of either type per object (no overlap). 
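
Erasure set selection is deterministic: the full object namespace decides which erasure set receives an object, so repeated operations on the same name always resolve to the same set. The sketch below demonstrates that property only; the hash shown is a hypothetical stand-in and is not MinIO's actual selection algorithm.

.. code-block:: python

   # Demonstrates deterministic placement only. The hash used here is a
   # hypothetical stand-in, not MinIO's actual selection algorithm.
   import hashlib

   def pick_erasure_set(object_namespace: str, erasure_set_count: int) -> int:
       """Map BUCKET/PREFIX/.../OBJECT deterministically onto an erasure set index."""
       digest = hashlib.sha256(object_namespace.encode("utf-8")).digest()
       return int.from_bytes(digest[:8], "big") % erasure_set_count

   # The same namespace always maps to the same erasure set, so reads, writes,
   # and all versions of an object stay together.
   print(pick_erasure_set("mybucket/invoices/2023/invoice-0001.pdf", 8))
   print(pick_erasure_set("mybucket/invoices/2023/invoice-0001.pdf", 8))  # identical result
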
MinIO automatically calculates the number and size of Erasure Sets ("stripe size") based on the total number of nodes and drives in the :ref:`Server Pool `, where the minimum stripe size is 2 and the maximum stripe size is 16. All erasure sets in a given pool have the same stripe size, and MinIO never modifies nor allows modification of stripe size after initial configuration. @@ -51,13 +51,16 @@ As a general guide, plan your topologies to have an even number of nodes and dri Erasure Code Parity (``EC:N``) ------------------------------ -MinIO uses a Reed-Solomon algorithm to split objects into data and parity blocks based on the :ref:`Erasure Set ` size in the deployment. -For a given erasure set of size ``M``, MinIO splits objects into ``N`` parity blocks and ``M-N`` data blocks. +MinIO uses a Reed-Solomon algorithm to split objects into data and parity shards based on the :ref:`Erasure Set ` size in the deployment. +For a given erasure set of size ``M``, MinIO splits objects into ``N`` parity shards and ``M-N`` data shards. -MinIO uses the ``EC:N`` notation to refer to the number of parity blocks (``N``) in the deployment. -MinIO defaults to ``EC:4`` or 4 parity blocks per object. +MinIO uses the ``EC:N`` notation to refer to the number of parity shards (``N``) in the deployment. +MinIO defaults to ``EC:4`` or 4 parity shards per object. MinIO uses the same ``EC:N`` value for all erasure sets and :ref:`server pools ` in the deployment. +.. _minio-read-quorum: +.. _minio-write-quorum: + MinIO can tolerate the loss of up to ``N`` drives per erasure set and continue performing read and write operations ("quorum"). If ``N`` is equal to exactly 1/2 the drives in the erasure set, MinIO write quorum requires :math:`N + 1` drives to avoid data inconsistency ("split-brain"). diff --git a/source/operations/install-deploy-manage/expand-minio-deployment.rst b/source/operations/install-deploy-manage/expand-minio-deployment.rst index 42c17e92..c8c1d87c 100644 --- a/source/operations/install-deploy-manage/expand-minio-deployment.rst +++ b/source/operations/install-deploy-manage/expand-minio-deployment.rst @@ -10,24 +10,16 @@ Expand a Distributed MinIO Deployment :local: :depth: 1 -A distributed MinIO deployment consists of 4 or more drives/volumes managed by -one or more :mc:`minio server` process, where the processes manage pooling the -compute and storage resources into a single aggregated object storage resource. -Each MinIO server has a complete picture of the distributed topology, such that -an application can connect to any node in the deployment and perform S3 -operations. +MinIO supports expanding an existing distributed deployment by adding a new :ref:`Server Pool `. +Each Pool expands the total available storage capacity of the cluster. -MinIO supports expanding an existing distributed deployment by adding a new -:ref:`Server Pool `. Each Pool expands the total -available storage capacity of the cluster while maintaining the overall -:ref:`availability ` of the cluster. Each Pool is its -own failure domain, where the loss of one or more drives or nodes in that pool -does not effect the availability of other pools in the deployment. +Expansion does not provide Business Continuity/Disaster Recovery (BC/DR)-grade protections. +While each pool is an independent set of servers with distinct :ref:`erasure sets ` for availability, the complete loss of one pool results in MinIO stopping I/O for all pools in the deployment. 
+Similarly, an erasure set which loses quorum in one pool represents data loss of objects stored in that set, regardless of the number of other erasure sets or pools. -The procedure on this page expands an existing -:ref:`distributed ` MinIO deployment with an -additional server pool. +To provide BC/DR-grade failover and recovery support for your single or multi-pool MinIO deployments, use :ref:`site replication `. +The procedure on this page expands an existing :ref:`distributed ` MinIO deployment with an additional server pool.
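
Because applications address the deployment through a single endpoint (typically a load balancer) rather than individual pools, client code does not change when you add a server pool. The sketch below uses the MinIO Python SDK with a hypothetical endpoint, credentials, and bucket name; the same connection works before and after expansion.

.. code-block:: python

   # Minimal sketch using the MinIO Python SDK ("pip install minio").
   # The endpoint, credentials, and bucket name are hypothetical.
   import io

   from minio import Minio

   client = Minio(
       "minio.example.net:9000",   # load balancer endpoint, not an individual pool or node
       access_key="ACCESS_KEY",
       secret_key="SECRET_KEY",
       secure=True,
   )

   if not client.bucket_exists("app-data"):
       client.make_bucket("app-data")

   payload = b"expansion does not change client code"
   client.put_object("app-data", "notes/expansion.txt", io.BytesIO(payload), length=len(payload))

   response = client.get_object("app-data", "notes/expansion.txt")
   try:
       print(response.read().decode("utf-8"))
   finally:
       response.close()
       response.release_conn()

If expansion adds new nodes behind the load balancer, only the load balancer configuration changes; the client code above stays the same.

.. _expand-minio-distributed-prereqs: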