/*
 *	file.c
 *
 *	file system operations
 *
 *	Copyright (c) 2010-2016, PostgreSQL Global Development Group
 *	src/bin/pg_upgrade/file.c
 */

#include "postgres_fe.h"

#include "access/visibilitymap.h"
#include "pg_upgrade.h"
#include "storage/bufpage.h"
#include "storage/checksum.h"
#include "storage/checksum_impl.h"

#include <sys/stat.h>
#include <fcntl.h>

/* Number of visibility map bits per heap page in the old on-disk format */
#define BITS_PER_HEAPBLOCK_OLD 1


#ifndef WIN32
static int	copy_file(const char *fromfile, const char *tofile);
#else
static int	win32_pghardlink(const char *src, const char *dst);
#endif


/*
 * copyFile()
 *
 *	Copies a relation file from src to dst.
 *
 *	Returns NULL on success, or the text produced by getErrorText() on
 *	failure.
 */
const char *
copyFile(const char *src, const char *dst)
{
#ifndef WIN32
	if (copy_file(src, dst) == -1)
#else
	if (CopyFile(src, dst, true) == 0)
#endif
		return getErrorText();
	else
		return NULL;
}


/*
 * linkFile()
 *
 *	Creates a hard link between the given relation files.  We use
 *	this function to perform a true in-place update.  If the on-disk
 *	format of the new cluster is bit-for-bit compatible with the on-disk
 *	format of the old cluster, we can simply link each relation
 *	instead of copying the data from the old cluster to the new cluster.
 *
 *	Returns NULL on success, or the text produced by getErrorText() on
 *	failure.
 */
const char *
linkFile(const char *src, const char *dst)
{
	if (pg_link_file(src, dst) == -1)
		return getErrorText();
	else
		return NULL;
}


#ifndef WIN32
/*
 * copy_file()
 *
 *	Copies the contents of srcfile into a newly created dstfile.  The
 *	destination is opened with O_CREAT | O_EXCL, so it must not already
 *	exist.
 *
 *	Returns 0 on success.  On failure returns -1 with errno describing the
 *	first error encountered.
 */
static int
copy_file(const char *srcfile, const char *dstfile)
{
#define COPY_BUF_SIZE (50 * BLCKSZ)

	int			src_fd;
	int			dest_fd;
	char	   *buffer;
	int			ret = 0;
	int			save_errno = 0;

	if ((srcfile == NULL) || (dstfile == NULL))
	{
		errno = EINVAL;
		return -1;
	}

	if ((src_fd = open(srcfile, O_RDONLY, 0)) < 0)
		return -1;

	if ((dest_fd = open(dstfile, O_RDWR | O_CREAT | O_EXCL,
						S_IRUSR | S_IWUSR)) < 0)
	{
		/* close() may change errno, so preserve open()'s value around it */
		save_errno = errno;
		close(src_fd);
		errno = save_errno;
		return -1;
	}

	buffer = (char *) pg_malloc(COPY_BUF_SIZE);

	/* perform data copying, i.e. read from source, write to destination */
	while (true)
	{
		ssize_t		nbytes = read(src_fd, buffer, COPY_BUF_SIZE);

		if (nbytes < 0)
		{
			save_errno = errno;
			ret = -1;
			break;
		}

		if (nbytes == 0)
			break;

		errno = 0;

		if (write(dest_fd, buffer, nbytes) != nbytes)
		{
			/* if write didn't set errno, assume problem is no disk space */
			if (errno == 0)
				errno = ENOSPC;
			save_errno = errno;
			ret = -1;
			break;
		}
	}

	pg_free(buffer);

	/*
	 * Both descriptors are known to be open here.  Descriptor 0 is a valid
	 * value, so close them unconditionally rather than testing against zero.
	 */
	close(src_fd);
	close(dest_fd);

	if (save_errno != 0)
		errno = save_errno;

	return ret;
}
#endif


/*
 * rewriteVisibilityMap()
 *
 * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
 * visibility map included one bit per heap page; it now includes two.
 * When upgrading a cluster from before that time to a current PostgreSQL
 * version, we could refuse to copy visibility maps from the old cluster
 * to the new cluster; the next VACUUM would recreate them, but at the
 * price of scanning the entire table.  So, instead, we rewrite the old
 * visibility maps in the new format.  That way, the all-visible bit
 * remains set for the pages for which it was set previously.  The
 * all-frozen bit is never set by this conversion; we leave that to
 * VACUUM.
 *
 * fromfile is the old-format map to read; tofile is created (it must not
 * already exist) and receives the new-format map.
 *
 * Returns NULL on success, otherwise a message describing the failure.
 * The message may be a string literal, so callers must not free it.
 */
const char *
rewriteVisibilityMap(const char *fromfile, const char *tofile)
{
	int			src_fd;
	int			dst_fd;
	char	   *buffer;
	char	   *new_vmbuf;
	const char *errmsg = NULL;
	ssize_t		totalBytesRead = 0;
	ssize_t		src_filesize;
	int			rewriteVmBytesPerPage;
	BlockNumber new_blkno = 0;
	struct stat statbuf;

	/* Compute how many old-format bytes are needed to fill one new page */
	rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;

	if ((fromfile == NULL) || (tofile == NULL))
		return "Invalid old file or new file";

	/*
	 * NOTE(review): these open() calls request no binary mode.  If this can
	 * run on a platform that distinguishes text and binary files, confirm
	 * whether a binary-mode flag is needed here.
	 */
	if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0)
		return getErrorText();

	if (fstat(src_fd, &statbuf) != 0)
	{
		/*
		 * Fetch the error text before close(), since close() could disturb
		 * the error state that getErrorText() reports (presumably errno).
		 */
		errmsg = getErrorText();
		close(src_fd);
		return errmsg;
	}

	if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL,
					   S_IRUSR | S_IWUSR)) < 0)
	{
		errmsg = getErrorText();
		close(src_fd);
		return errmsg;
	}

	/* Save old file size */
	src_filesize = statbuf.st_size;

	/*
	 * The new page is accessed through a PageHeader pointer and handed to
	 * pg_checksum_page() below, so the page buffers must be suitably aligned.
	 * A plain char array on the stack carries no such guarantee; allocated
	 * memory does.
	 */
	buffer = (char *) pg_malloc(BLCKSZ);
	new_vmbuf = (char *) pg_malloc(BLCKSZ);

	/*
	 * Turn each visibility map page into 2 pages one by one.  Each new page
	 * has the same page header as the old one.  If the last section of the
	 * last page is empty, we skip it, mostly to avoid turning one-page
	 * visibility maps for small relations into two pages needlessly.
	 */
	while (totalBytesRead < src_filesize)
	{
		ssize_t		bytesRead;
		char	   *old_cur;
		char	   *old_break;
		char	   *old_blkend;
		PageHeaderData pageheader;
		bool		old_lastblk;

		bytesRead = read(src_fd, buffer, BLCKSZ);
		if (bytesRead != BLCKSZ)
		{
			/*
			 * Only a negative result means read() failed and set errno.  A
			 * short read leaves errno untouched, so report it explicitly
			 * instead of returning whatever stale error text is around.
			 */
			if (bytesRead < 0)
				errmsg = getErrorText();
			else
				errmsg = "partial page found in visibility map file";
			goto done;
		}

		totalBytesRead += BLCKSZ;
		old_lastblk = (totalBytesRead == src_filesize);

		/* Save the page header data */
		memcpy(&pageheader, buffer, SizeOfPageHeaderData);

		/*
		 * These old_* variables point into the old visibility map page.
		 * old_cur is the current position on the old page.  old_blkend is
		 * the end of the old block.  old_break is the end+1 position of the
		 * old data that goes into the current new page; after a new page
		 * has been written, old_break advances by rewriteVmBytesPerPage
		 * bytes.
		 */
		old_cur = buffer + SizeOfPageHeaderData;
		old_blkend = buffer + bytesRead;
		old_break = old_cur + rewriteVmBytesPerPage;

		while (old_break <= old_blkend)
		{
			char	   *new_cur;
			bool		empty = true;
			bool		old_lastpart;

			/* Copy page header in advance */
			memcpy(new_vmbuf, &pageheader, SizeOfPageHeaderData);

			/* Rewriting the last part of the last old page? */
			old_lastpart = old_lastblk && (old_break == old_blkend);

			new_cur = new_vmbuf + SizeOfPageHeaderData;

			/* Process old page bytes one by one, turning each into two */
			while (old_cur < old_break)
			{
				uint8		byte = *(uint8 *) old_cur;
				uint16		new_vmbits = 0;
				int			i;

				/* Generate new format bits while keeping old information */
				for (i = 0; i < BITS_PER_BYTE; i++)
				{
					if (byte & (1 << (BITS_PER_HEAPBLOCK_OLD * i)))
					{
						empty = false;
						new_vmbits |= 1 << (BITS_PER_HEAPBLOCK * i);
					}
				}

				/*
				 * Store the 16 new bits as two bytes, low-order byte first,
				 * so that the bits for earlier heap pages land in the earlier
				 * map byte regardless of host byte order.  (A memcpy of the
				 * uint16 would swap the two bytes on big-endian machines.)
				 * This relies on each old byte expanding to exactly two new
				 * bytes, which is also what rewriteVmBytesPerPage assumes.
				 */
				new_cur[0] = (char) (new_vmbits & 0xFF);
				new_cur[1] = (char) (new_vmbits >> 8);

				old_cur += BITS_PER_HEAPBLOCK_OLD;
				new_cur += BITS_PER_HEAPBLOCK;
			}

			/* If the last part of the last page is empty, skip writing it */
			if (old_lastpart && empty)
				break;

			/*
			 * Set new checksum for a visibility map page (if enabled).
			 *
			 * NOTE(review): the page is written into the new cluster, so it
			 * may be that only the new cluster's setting should matter here;
			 * confirm before changing.
			 */
			if (old_cluster.controldata.data_checksum_version != 0 &&
				new_cluster.controldata.data_checksum_version != 0)
				((PageHeader) new_vmbuf)->pd_checksum =
					pg_checksum_page(new_vmbuf, new_blkno);

			errno = 0;
			if (write(dst_fd, new_vmbuf, BLCKSZ) != BLCKSZ)
			{
				/* if write didn't set errno, assume problem is no disk space */
				if (errno == 0)
					errno = ENOSPC;
				errmsg = getErrorText();
				goto done;
			}

			old_break += rewriteVmBytesPerPage;
			new_blkno++;
		}
	}

done:
	/* Release buffers and close files; errmsg is NULL if all went well */
	pg_free(new_vmbuf);
	pg_free(buffer);
	close(dst_fd);
	close(src_fd);

	return errmsg;
}


/*
 * check_hard_link()
 *
 *	Tests whether a hard link can be created from the old cluster's data
 *	directory into the new cluster's data directory, by linking the old
 *	PG_VERSION file to a scratch name in the new directory.  Calls pg_fatal()
 *	if the link cannot be created; otherwise removes the scratch link.
 */
void
check_hard_link(void)
{
	char		existing_file[MAXPGPATH];
	char		new_link_file[MAXPGPATH];

	snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
	snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", new_cluster.pgdata);
	unlink(new_link_file);		/* might fail */

	if (pg_link_file(existing_file, new_link_file) == -1)
	{
		pg_fatal("Could not create hard link between old and new data directories: %s\n"
				 "In link mode the old and new data directories must be on the same file system volume.\n",
				 getErrorText());
	}
	unlink(new_link_file);
}


#ifdef WIN32
/*
 * win32_pghardlink()
 *
 *	Creates a hard link named dst that refers to the existing file src.
 *	Returns 0 on success and -1 on failure.
 */
static int
win32_pghardlink(const char *src, const char *dst)
{
	/*
	 * CreateHardLinkA returns zero for failure
	 * http://msdn.microsoft.com/en-us/library/aa363860(VS.85).aspx
	 */
	if (CreateHardLinkA(dst, src, NULL) == 0)
		return -1;
	else
		return 0;
}
#endif


/*
 * fopen_priv()
 *
 *	fopen() a file with no group/other permissions.  The process umask is
 *	tightened only for the duration of the fopen() call and restored
 *	afterwards.  Returns whatever fopen() returned.
 */
FILE *
fopen_priv(const char *path, const char *mode)
{
	mode_t		old_umask = umask(S_IRWXG | S_IRWXO);
	FILE	   *fp;

	fp = fopen(path, mode);
	umask(old_umask);

	return fp;
}