refactor: use yauzl for unzipping

Michael Kreuzmayr 2023-03-30 22:56:58 +02:00
parent b0db8caf65
commit 08c7e38587
3 changed files with 110 additions and 171 deletions

package.json

@@ -69,6 +69,7 @@
         "@types/minimist": "^1.2.2",
         "@types/node": "^18.15.3",
         "@types/react": "18.0.9",
+        "@types/yauzl": "^2.10.0",
         "concurrently": "^7.6.0",
         "copyfiles": "^2.4.1",
         "husky": "^4.3.8",
@@ -94,6 +95,7 @@
         "react-markdown": "^5.0.3",
         "rfc4648": "^1.5.2",
         "tsafe": "^1.6.0",
+        "yauzl": "^2.10.0",
         "zod": "^3.17.10"
     }
 }
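The new runtime dependency is yauzl, plus its type definitions. For orientation before the diff of the rewritten unzip helper below, here is a minimal sketch (not part of this commit) of yauzl's lazy-entry reading pattern that the new helper is built on; the archive name, output paths and the lack of error recovery are illustrative only.

import fs from "fs";
import yauzl from "yauzl";

// Open lazily: an entry is only read when readEntry() is called,
// so the archive is streamed instead of buffered in memory.
yauzl.open("archive.zip", { lazyEntries: true }, (err, zipfile) => {
    if (err || !zipfile) throw err;
    zipfile.readEntry();
    zipfile.on("entry", entry => {
        if (entry.fileName.endsWith("/")) {
            // Directory entry: nothing to extract, advance to the next one.
            zipfile.readEntry();
            return;
        }
        zipfile.openReadStream(entry, (err, readStream) => {
            if (err || !readStream) throw err;
            // NOTE: parent directories are not created in this sketch.
            readStream.on("end", () => zipfile.readEntry());
            readStream.pipe(fs.createWriteStream(entry.fileName));
        });
    });
    zipfile.once("end", () => zipfile.close());
});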


@@ -1,184 +1,89 @@
-import { createReadStream, createWriteStream } from "fs";
-import { mkdir, stat, unlink } from "fs/promises";
-import { dirname as pathDirname, join as pathJoin, relative as pathRelative } from "path";
-import { type Readable } from "stream";
-import { createInflateRaw } from "zlib";
-
-import { partitionPromiseSettledResults } from "./partitionPromiseSettledResults";
-
-export type MultiError = Error & { cause: Error[] };
-
-/**
- * Extract the archive `zipFile` into the directory `dir`. If `archiveDir` is given,
- * only that directory will be extracted, stripping the given path components.
- *
- * If dir does not exist, it will be created.
- *
- * If any archive file exists, it will be overwritten.
- *
- * Will unzip using all available nodejs worker threads.
- *
- * Will try to clean up extracted files on failure.
- *
- * If unpacking fails, will either throw an regular error, or
- * possibly an `MultiError`, which contains a `cause` field with
- * a number of root cause errors.
- *
- * Warning this method is not optimized for continuous reading of the zip
- * archive, but is a trade-off between simplicity and allowing extraction
- * of a single directory from the archive.
- *
- * @param zipFilePath the file to unzip
- * @param extractDirPath the target directory
- * @param pathOfDirToExtractInArchive if given, unpack only files from this archive directory
- * @throws {MultiError} error
- * @returns Promise for a list of full file paths pointing to actually extracted files
- */
-export async function unzip(zipFilePath: string, extractDirPath: string, pathOfDirToExtractInArchive?: string): Promise<string[]> {
-    const dirsCreated: (string | undefined)[] = [];
-    dirsCreated.push(await mkdir(extractDirPath, { recursive: true }));
-    const promises: Promise<string>[] = [];
-
-    // Iterate over all files in the zip, skip files which are not in archiveDir,
-    // if given.
-    for await (const record of iterateZipArchive(zipFilePath)) {
-        const { path: recordPath, createReadStream: createRecordReadStream } = record;
-        if (pathOfDirToExtractInArchive && !recordPath.startsWith(pathOfDirToExtractInArchive)) {
-            continue;
-        }
-        const relativePath = pathOfDirToExtractInArchive ? pathRelative(pathOfDirToExtractInArchive, recordPath) : recordPath;
-        const filePath = pathJoin(extractDirPath, relativePath);
-        const parent = pathDirname(filePath);
-        promises.push(
-            new Promise<string>(async (resolve, reject) => {
-                if (!dirsCreated.includes(parent)) dirsCreated.push(await mkdir(parent, { recursive: true }));
-                // Pull the file out of the archive, write it to the target directory
-                const output = createWriteStream(filePath);
-                output.on("error", e => reject(Object.assign(e, { filePath })));
-                output.on("finish", () => resolve(filePath));
-                createRecordReadStream().pipe(output);
-            })
-        );
-    }
-
-    // Wait until _all_ files are either extracted or failed
-    const [success, failure] = (await Promise.allSettled(promises)).reduce(...partitionPromiseSettledResults<string>());
-
-    // If any extraction failed, try to clean up, then throw a MultiError,
-    // which has a `cause` field, containing a list of root cause errors.
-    if (failure.length) {
-        await Promise.all([
-            ...success.map(path => unlink(path).catch(_unused => undefined)),
-            ...failure.map(e => e && e.path && unlink(e.path as string).catch(_unused => undefined))
-        ]);
-        await Promise.all(dirsCreated.filter(Boolean).sort(sortByFolderDepth("desc")));
-        const e = new Error("Failed to extract: " + failure.map(e => e.message).join(";"));
-        (e as any).cause = failure;
-        throw e;
-    }
-
-    return success;
-}
-
-function depth(dir: string) {
-    return dir.match(/\//g)?.length ?? 0;
-}
-
-function sortByFolderDepth(order: "asc" | "desc") {
-    const ord = order === "asc" ? 1 : -1;
-    return (a: string | undefined, b: string | undefined) => ord * depth(a ?? "") + -ord * depth(b ?? "");
-}
-
-/**
- *
- * @param file file to read
- * @param start first byte to read
- * @param end last byte to read
- * @returns Promise of a buffer of read bytes
- */
-async function readFileChunk(file: string, start: number, end: number): Promise<Buffer> {
-    const chunks: Buffer[] = [];
-    return new Promise((resolve, reject) => {
-        const stream = createReadStream(file, { start, end });
-        stream.setMaxListeners(Infinity);
-        stream.on("error", e => reject(e));
-        stream.on("end", () => resolve(Buffer.concat(chunks)));
-        stream.on("data", chunk => chunks.push(chunk as Buffer));
-    });
-}
-
-type ZipRecord = {
-    path: string;
-    createReadStream: () => Readable;
-    compressionMethod: "deflate" | undefined;
-};
-
-type ZipRecordGenerator = AsyncGenerator<ZipRecord, void, unknown>;
-
-/**
- * Iterate over all records of a zipfile, and yield a ZipRecord.
- * Use `record.createReadStream()` to actually read the file.
- *
- * Warning this method will only work with single-disk zip files.
- * Warning this method may fail if the zip archive has an crazy amount
- * of files and the central directory is not fully contained within the
- * last 65k bytes of the zip file.
- *
- * @param zipFile
- * @returns AsyncGenerator which will yield ZipRecords
- */
-async function* iterateZipArchive(zipFile: string): ZipRecordGenerator {
-    // Need to know zip file size before we can do anything else
-    const { size } = await stat(zipFile);
-    const chunkSize = 65_535 + 22 + 1; // max comment size + end header size + wiggle
-    // Read last ~65k bytes. Zip files have an comment up to 65_535 bytes at the very end,
-    // before that comes the zip central directory end header.
-    let chunk = await readFileChunk(zipFile, size - chunkSize, size);
-    const unread = size - chunk.length;
-    let i = chunk.length - 4;
-    let found = false;
-
-    // Find central directory end header, reading backwards from the end
-    while (!found && i-- > 0) if (chunk[i] === 0x50 && chunk.readUInt32LE(i) === 0x06054b50) found = true;
-
-    if (!found) throw new Error("Not a zip file");
-
-    // This method will fail on a multi-disk zip, so bail early.
-    if (chunk.readUInt16LE(i + 4) !== 0) throw new Error("Multi-disk zip not supported");
-
-    let nFiles = chunk.readUint16LE(i + 10);
-
-    // Get the position of the central directory
-    const directorySize = chunk.readUint32LE(i + 12);
-    const directoryOffset = chunk.readUint32LE(i + 16);
-    if (directoryOffset === 0xffff_ffff) throw new Error("zip64 not supported");
-    if (directoryOffset > size) throw new Error(`Central directory offset ${directoryOffset} is outside file`);
-
-    i = directoryOffset - unread;
-
-    // If i < 0, it means that the central directory is not contained within `chunk`
-    if (i < 0) {
-        chunk = await readFileChunk(zipFile, directoryOffset, directoryOffset + directorySize);
-        i = 0;
-    }
-
-    // Now iterate the central directory records, yield an `ZipRecord` for every entry
-    while (nFiles-- > 0) {
-        // Check for marker bytes
-        if (chunk.readUInt32LE(i) !== 0x02014b50) throw new Error("No central directory record at position " + (unread + i));
-        const compressionMethod = ({ 8: "deflate" } as const)[chunk.readUint16LE(i + 10)];
-        const compressedFileSize = chunk.readUint32LE(i + 20);
-        const filenameLength = chunk.readUint16LE(i + 28);
-        const extraLength = chunk.readUint16LE(i + 30);
-        const commentLength = chunk.readUint16LE(i + 32);
-        // Start of the actual content byte stream is after the 'local' record header,
-        // which is 30 bytes long plus filename and extra field
-        const start = chunk.readUint32LE(i + 42) + 30 + filenameLength + extraLength;
-        const end = start + compressedFileSize;
-        const filename = chunk.slice(i + 46, i + 46 + filenameLength).toString("utf-8");
-        const createRecordReadStream = () => {
-            const input = createReadStream(zipFile, { start, end });
-            if (compressionMethod === "deflate") {
-                const inflate = createInflateRaw();
-                input.pipe(inflate);
-                return inflate;
-            }
-            return input;
-        };
-        if (end > start) yield { path: filename, createReadStream: createRecordReadStream, compressionMethod };
-        // advance pointer to next central directory entry
-        i += 46 + filenameLength + extraLength + commentLength;
-    }
-}
+import fsp from "node:fs/promises";
+import fs from "fs";
+import path from "node:path";
+import yauzl from "yauzl";
+import stream from "node:stream";
+import { promisify } from "node:util";
+
+const pipeline = promisify(stream.pipeline);
+
+async function pathExists(path: string) {
+    try {
+        await fsp.stat(path);
+        return true;
+    } catch (error) {
+        if ((error as { code: string }).code === "ENOENT") {
+            return false;
+        }
+        throw error;
+    }
+}
+
+export async function unzip(file: string, targetFolder: string, unzipSubPath?: string) {
+    // add trailing slash to unzipSubPath and targetFolder
+    if (unzipSubPath && (!unzipSubPath.endsWith("/") || !unzipSubPath.endsWith("\\"))) {
+        unzipSubPath += "/";
+    }
+
+    if (!targetFolder.endsWith("/") || !targetFolder.endsWith("\\")) {
+        targetFolder += "/";
+    }
+
+    return new Promise<void>((resolve, reject) => {
+        yauzl.open(file, { lazyEntries: true }, async (err, zipfile) => {
+            if (err) {
+                reject(err);
+                return;
+            }
+
+            zipfile.readEntry();
+
+            zipfile.on("entry", async entry => {
+                if (unzipSubPath) {
+                    // Skip files outside of the unzipSubPath
+                    if (!entry.fileName.startsWith(unzipSubPath)) {
+                        zipfile.readEntry();
+                        return;
+                    }
+
+                    // Remove the unzipSubPath from the file name
+                    entry.fileName = entry.fileName.substring(unzipSubPath.length);
+                }
+
+                const target = path.join(targetFolder, entry.fileName);
+
+                // Directory file names end with '/'.
+                // Note that entries for directories themselves are optional.
+                // An entry's fileName implicitly requires its parent directories to exist.
+                if (/[\/\\]$/.test(target)) {
+                    await fsp.mkdir(target, { recursive: true });
+
+                    zipfile.readEntry();
+                    return;
+                }
+
+                // Skip existing files
+                if (await pathExists(target)) {
+                    zipfile.readEntry();
+                    return;
+                }
+
+                zipfile.openReadStream(entry, async (err, readStream) => {
+                    if (err) {
+                        reject(err);
+                        return;
+                    }
+
+                    await pipeline(readStream, fs.createWriteStream(target));
+
+                    zipfile.readEntry();
+                });
+            });
+
+            zipfile.once("end", function () {
+                zipfile.close();
+                resolve();
+            });
+        });
+    });
+}
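For completeness, a hedged sketch of how the rewritten helper might be called; only the signature unzip(file, targetFolder, unzipSubPath?) comes from the diff above, while the import path, archive path and destination folder below are made-up examples.

import { unzip } from "./unzip"; // hypothetical module path, not shown in this diff

async function main() {
    // Extract only the "dist/" directory of the archive into ./build,
    // stripping the "dist/" prefix (the third argument is optional).
    await unzip("downloads/theme.zip", "build", "dist");
}

main().catch(error => {
    console.error("extraction failed", error);
    process.exit(1);
});

Note a behavioral difference visible in the diff: the old implementation resolved to the list of extracted file paths and overwrote existing files, whereas the yauzl-based version resolves to void and skips files that already exist on disk.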

yarn.lock

@@ -480,6 +480,13 @@
   resolved "https://registry.yarnpkg.com/@types/unist/-/unist-2.0.6.tgz#250a7b16c3b91f672a24552ec64678eeb1d3a08d"
   integrity sha512-PBjIUxZHOuj0R15/xuwJYjFi+KZdNFrehocChv4g5hu6aFroHue8m0lBP0POdK2nKzbw0cgV1mws8+V/JAcEkQ==
 
+"@types/yauzl@^2.10.0":
+  version "2.10.0"
+  resolved "https://registry.yarnpkg.com/@types/yauzl/-/yauzl-2.10.0.tgz#b3248295276cf8c6f153ebe6a9aba0c988cb2599"
+  integrity sha512-Cn6WYCm0tXv8p6k+A8PvbDG763EDpBoTzHdA+Q/MF6H3sapGjCm9NzoaJncJS9tUKSuCoDs9XHxYYsQDgxR6kw==
+  dependencies:
+    "@types/node" "*"
+
 acorn-walk@^8.1.1:
   version "8.2.0"
   resolved "https://registry.yarnpkg.com/acorn-walk/-/acorn-walk-8.2.0.tgz#741210f2e2426454508853a2f44d0ab83b7f69c1"
@@ -635,6 +642,11 @@ browserslist@^4.21.3:
     node-releases "^2.0.8"
     update-browserslist-db "^1.0.10"
 
+buffer-crc32@~0.2.3:
+  version "0.2.13"
+  resolved "https://registry.yarnpkg.com/buffer-crc32/-/buffer-crc32-0.2.13.tgz#0d333e3f00eac50aa1454abd30ef8c2a5d9a7242"
+  integrity sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==
+
 cacache@^17.0.0:
   version "17.0.4"
   resolved "https://registry.yarnpkg.com/cacache/-/cacache-17.0.4.tgz#5023ed892ba8843e3b7361c26d0ada37e146290c"
@@ -1103,6 +1115,13 @@ fastq@^1.6.0:
   dependencies:
     reusify "^1.0.4"
 
+fd-slicer@~1.1.0:
+  version "1.1.0"
+  resolved "https://registry.yarnpkg.com/fd-slicer/-/fd-slicer-1.1.0.tgz#25c7c89cb1f9077f8891bbe61d8f390eae256f1e"
+  integrity sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==
+  dependencies:
+    pend "~1.2.0"
+
 fill-range@^7.0.1:
   version "7.0.1"
   resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.0.1.tgz#1919a6a7c75fe38b2c7c77e5198535da9acdda40"
@@ -1924,6 +1943,11 @@ path-type@^4.0.0:
   resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b"
   integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==
 
+pend@~1.2.0:
+  version "1.2.0"
+  resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50"
+  integrity sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==
+
 picocolors@^1.0.0:
   version "1.0.0"
   resolved "https://registry.yarnpkg.com/picocolors/-/picocolors-1.0.0.tgz#cb5bdc74ff3f51892236eaf79d68bc44564ab81c"
@@ -2642,6 +2666,14 @@ yargs@^17.3.1:
     y18n "^5.0.5"
     yargs-parser "^21.1.1"
 
+yauzl@^2.10.0:
+  version "2.10.0"
+  resolved "https://registry.yarnpkg.com/yauzl/-/yauzl-2.10.0.tgz#c7eb17c93e112cb1086fa6d8e51fb0667b79a5f9"
+  integrity sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==
+  dependencies:
+    buffer-crc32 "~0.2.3"
+    fd-slicer "~1.1.0"
+
 yn@3.1.1:
   version "3.1.1"
   resolved "https://registry.yarnpkg.com/yn/-/yn-3.1.1.tgz#1e87401a09d767c1d5eab26a6e4c185182d2eb50"