Chapter 54: Zip Unzip With Progress

Project

Overview

The previous project focused on deterministic text analysis; now we package those artifacts and the surrounding diagnostics into a reproducible archiving pipeline (see Chapter 53). We will write a minimalist ZIP creator that streams files into memory, emits the central directory, and then verifies extraction while reporting incremental progress. The program leans on the standard library's ZIP reader, manual header encoding, StringHashMap bookkeeping for CRC32 checks, and structured status updates via std.Progress (zip.zig, hash_map.zig, crc.zig, Progress.zig).

Learning Goals

  • Assemble a ZIP archive from scratch by writing local file headers, the central directory, and the end-of-central-directory record in the correct order, while respecting size and offset constraints.
  • Capture deterministic integrity metrics (CRC32, SHA-256) alongside the bundle so continuous integration can verify both structure and content on every run (crypto.zig).
  • Surface parser-friendly progress messages by disabling the animated renderer and emitting plain-text checkpoints with std.Progress; see the sketch after this list.
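
A minimal sketch of that progress pattern in isolation, with placeholder names and counts that are not part of the final program: a root node with printing disabled, plus one child stage that counts work items.

Zig
const std = @import("std");

pub fn main() void {
    // Root node with printing disabled: no animated frames, so any
    // surrounding output stays deterministic in captured logs.
    var root = std.Progress.start(.{
        .root_name = "demo",
        .estimated_total_items = 1,
        .disable_printing = true,
    });
    defer root.end();

    // One child stage that expects four units of work.
    var stage = root.start("stage-one", 4);
    for (0..4) |_| stage.completeOne();
    stage.end();
}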

Designing the Pipeline

The workflow splits into three stages: seeding the sample files, building the archive, and extracting plus verifying. Each stage advances the root progress node and produces deterministic console summaries that double as acceptance criteria. All filesystem operations happen beneath a temporary directory managed by std.testing.tmpDir, keeping the real workspace clean (Chapter 47, testing.zig).
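
The tmpDir pattern is worth seeing on its own. A minimal sketch, assuming it runs inside a test block (the full program below calls std.testing.tmpDir from main in exactly the same way):

Zig
const std = @import("std");

test "work under a temporary directory" {
    // tmpDir creates a unique directory under the local zig cache;
    // cleanup() deletes the whole tree when the block exits.
    var tmp = std.testing.tmpDir(.{});
    defer tmp.cleanup();

    var file = try tmp.dir.createFile("scratch.txt", .{ .truncate = true });
    defer file.close();
    try file.writeAll("scratch data\n");
}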

For the archive metadata we reuse the same relative paths when writing headers and, later, when verifying the extracted files. Storing each path's CRC32 and byte count in a StringHashMap lets us diff expected against actual output in a straightforward way after extraction.
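
A minimal sketch of that bookkeeping, using literal keys and a hypothetical placeholder digest rather than a real CRC32:

Zig
const std = @import("std");

const Metrics = struct { crc32: u32, size: usize };

test "record expected metrics per path" {
    var map = std.StringHashMap(Metrics).init(std.testing.allocator);
    defer map.deinit();

    // Hypothetical values standing in for a real CRC32 and byte count.
    try map.put("input/metrics.txt", .{ .crc32 = 0x1234_5678, .size = 36 });

    const expected = map.get("input/metrics.txt") orelse return error.ExpectedEntryMissing;
    try std.testing.expectEqual(@as(usize, 36), expected.size);
}

The full program additionally dupes each key so the map owns its strings; the static literals above sidestep that concern.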

Archive Assembly

Because Zig 0.15.2 ships a ZIP reader but not a writer, we build the archive in memory with an ArrayList(u8), appending each component in turn: local file header, file name, file bytes. Every header field is written with explicit little-endian helpers so the result is portable across architectures. Once the payloads are in the blob, we append the central directory (one record per file) followed by the end-of-central-directory record, mirroring the structures defined in the PKWARE APPNOTE and encoded in std.zip (array_list.zig, fmt.zig).
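
The portability claim is easy to check in isolation. A minimal sketch: encoding the local-header signature with std.mem.writeInt yields the canonical "PK\x03\x04" byte sequence regardless of host endianness.

Zig
const std = @import("std");

test "little-endian field encoding is host-independent" {
    var field: [4]u8 = undefined;
    // 0x04034b50 is the local file header signature from the APPNOTE.
    std.mem.writeInt(u32, &field, 0x0403_4b50, .little);
    try std.testing.expectEqualSlices(u8, "PK\x03\x04", &field);
}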

While writing headers we ensure that sizes and offsets fit their 32-bit fields (sticking to the classic ZIP subset) and copy each file name into the map exactly once so the resources can be freed deterministically later. Once the archive image is complete, we persist it to disk and compute a SHA-256 digest for downstream regressions; the digest is rendered with std.fmt.bytesToHex, so it can be compared inline without any extra tooling.
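
A minimal sketch of the digest step, assuming blob already holds the finished archive bytes (here just a placeholder string):

Zig
const std = @import("std");

pub fn main() void {
    const blob = "placeholder archive bytes"; // stand-in for the real archive
    var digest: [32]u8 = undefined;
    std.crypto.hash.sha2.Sha256.hash(blob, &digest, .{});
    // bytesToHex returns a fixed 64-byte array: easy to diff inline.
    const hex = std.fmt.bytesToHex(digest, .lower);
    std.debug.print("sha256={s}\n", .{hex[0..]});
}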

Extraction and Verification

Extraction reuses the standard library iterator, which walks each central directory record and hands the data stream to std.zip.Entry.extract; we normalize the root folder name via std.zip.Diagnostics so it can be exposed to the caller. After each file lands on disk, we compute the CRC32 again and compare the byte count against the recorded expectation. Any mismatch fails the program immediately, which makes it safe to embed in CI pipelines or deployment hooks.
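
Stripped of the verification loop, the extraction call itself is small. A minimal sketch, assuming archive.zip exists in the current working directory and out/ is the destination:

Zig
const std = @import("std");

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    var dest = try std.fs.cwd().makeOpenPath("out", .{});
    defer dest.close();

    var file = try std.fs.cwd().openFile("archive.zip", .{});
    defer file.close();

    var buf: [4096]u8 = undefined;
    var reader = file.reader(&buf);

    // Diagnostics records the common root folder (if any) seen during extraction.
    var diagnostics = std.zip.Diagnostics{ .allocator = allocator };
    defer diagnostics.deinit();

    try std.zip.extract(dest, &reader, .{ .diagnostics = &diagnostics });
    std.debug.print("extracted root: {s}\n", .{diagnostics.root_dir});
}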

std.Progress nodes drive the console output: the root node tracks the three high-level stages, while child nodes count files during seeding, building, and verification. Because printing is disabled, the final messages are plain-text lines (rendered through a buffered stdout writer) that can be diffed verbatim in automated tests (Chapter 47).

End-to-End Implementation

Zig
const std = @import("std");

const SampleFile = struct {
    path: []const u8,
    contents: []const u8,
};

const sample_files = [_]SampleFile{
    .{ .path = "input/metrics.txt", .contents = "uptime=420s\nrequests=1312\nerrors=3\n" },
    .{ .path = "input/inventory.json", .contents = "{\n  \"service\": \"telemetry\",\n  \"shards\": [\"alpha\", \"beta\", \"gamma\"]\n}\n" },
    .{ .path = "input/logs/app.log", .contents = "[info] ingest started\n[warn] queue delay=87ms\n[info] ingest completed\n" },
    .{ .path = "input/README.md", .contents = "# Telemetry bundle\n\nSynthetic records used for the zip/unzip progress demo.\n" },
};

const EntryMetrics = struct {
    crc32: u32,
    size: usize,
};

const BuildSummary = struct {
    bytes_written: usize,
    sha256: [32]u8,
};

const VerifySummary = struct {
    files_checked: usize,
    total_bytes: usize,
    extracted_root: []const u8,
    owns_root: bool,
};

const archive_path = "artifact/telemetry.zip";
const extract_root = "replay";

fn seedSamples(dir: std.fs.Dir, progress: *std.Progress.Node) !struct { files: usize, bytes: usize } {
    var total_bytes: usize = 0;
    for (sample_files) |sample| {
        if (std.fs.path.dirname(sample.path)) |parent| {
            try dir.makePath(parent);
        }
        var file = try dir.createFile(sample.path, .{ .truncate = true });
        defer file.close();
        try file.writeAll(sample.contents);
        total_bytes += sample.contents.len;
        progress.completeOne();
    }
    return .{ .files = sample_files.len, .bytes = total_bytes };
}

const EntryRecord = struct {
    name: []const u8,
    crc32: u32,
    size: u32,
    offset: u32,
};

fn makeLocalHeader(name_len: u16, crc32: u32, size: u32) [30]u8 {
    var header: [30]u8 = undefined;
    // Signature "PK\x03\x04" (0x04034b50): local file header.
    header[0] = 'P';
    header[1] = 'K';
    header[2] = 3;
    header[3] = 4;
    std.mem.writeInt(u16, header[4..6], 20, .little); // version needed to extract: 2.0
    std.mem.writeInt(u16, header[6..8], 0, .little); // general-purpose bit flags
    std.mem.writeInt(u16, header[8..10], 0, .little); // compression method: 0 = stored
    std.mem.writeInt(u16, header[10..12], 0, .little); // last mod file time
    std.mem.writeInt(u16, header[12..14], 0, .little); // last mod file date
    std.mem.writeInt(u32, header[14..18], crc32, .little); // CRC-32 of file data
    std.mem.writeInt(u32, header[18..22], size, .little); // compressed size
    std.mem.writeInt(u32, header[22..26], size, .little); // uncompressed size (same: stored)
    std.mem.writeInt(u16, header[26..28], name_len, .little); // file name length
    std.mem.writeInt(u16, header[28..30], 0, .little); // extra field length
    return header;
}

fn makeCentralHeader(entry: EntryRecord) [46]u8 {
    var header: [46]u8 = undefined;
    // Signature "PK\x01\x02" (0x02014b50): central directory file header.
    header[0] = 'P';
    header[1] = 'K';
    header[2] = 1;
    header[3] = 2;
    std.mem.writeInt(u16, header[4..6], 0x0314, .little); // version made by: UNIX (3), spec 2.0 (20)
    std.mem.writeInt(u16, header[6..8], 20, .little); // version needed to extract: 2.0
    std.mem.writeInt(u16, header[8..10], 0, .little); // general-purpose bit flags
    std.mem.writeInt(u16, header[10..12], 0, .little); // compression method: 0 = stored
    std.mem.writeInt(u16, header[12..14], 0, .little); // last mod file time
    std.mem.writeInt(u16, header[14..16], 0, .little); // last mod file date
    std.mem.writeInt(u32, header[16..20], entry.crc32, .little); // CRC-32 of file data
    std.mem.writeInt(u32, header[20..24], entry.size, .little); // compressed size
    std.mem.writeInt(u32, header[24..28], entry.size, .little); // uncompressed size
    const name_len_u16 = @as(u16, @intCast(entry.name.len));
    std.mem.writeInt(u16, header[28..30], name_len_u16, .little); // file name length
    std.mem.writeInt(u16, header[30..32], 0, .little); // extra field length
    std.mem.writeInt(u16, header[32..34], 0, .little); // file comment length
    std.mem.writeInt(u16, header[34..36], 0, .little); // disk number start
    std.mem.writeInt(u16, header[36..38], 0, .little); // internal file attributes
    const unix_mode: u32 = 0o100644 << 16; // external attributes: regular file, rw-r--r--
    std.mem.writeInt(u32, header[38..42], unix_mode, .little);
    std.mem.writeInt(u32, header[42..46], entry.offset, .little); // offset of local header
    return header;
}

fn makeEndRecord(cd_size: u32, cd_offset: u32, entry_count: u16) [22]u8 {
    var footer: [22]u8 = undefined;
    // Signature "PK\x05\x06" (0x06054b50): end of central directory record.
    footer[0] = 'P';
    footer[1] = 'K';
    footer[2] = 5;
    footer[3] = 6;
    std.mem.writeInt(u16, footer[4..6], 0, .little); // number of this disk
    std.mem.writeInt(u16, footer[6..8], 0, .little); // disk where central directory starts
    std.mem.writeInt(u16, footer[8..10], entry_count, .little); // entries on this disk
    std.mem.writeInt(u16, footer[10..12], entry_count, .little); // total entries
    std.mem.writeInt(u32, footer[12..16], cd_size, .little); // central directory size in bytes
    std.mem.writeInt(u32, footer[16..20], cd_offset, .little); // offset of central directory
    std.mem.writeInt(u16, footer[20..22], 0, .little); // comment length
    return footer;
}

fn buildArchive(
    allocator: std.mem.Allocator,
    dir: std.fs.Dir,
    metrics: *std.StringHashMap(EntryMetrics),
    progress: *std.Progress.Node,
) !BuildSummary {
    if (std.fs.path.dirname(archive_path)) |parent| {
        try dir.makePath(parent);
    }
    var entries = try std.ArrayList(EntryRecord).initCapacity(allocator, sample_files.len);
    defer entries.deinit(allocator);

    try metrics.ensureTotalCapacity(sample_files.len);

    var blob: std.ArrayList(u8) = .empty;
    defer blob.deinit(allocator);

    for (sample_files) |sample| {
        if (sample.path.len > std.math.maxInt(u16)) return error.NameTooLong;

        var file = try dir.openFile(sample.path, .{});
        defer file.close();

        const max_len = 64 * 1024;
        const data = try file.readToEndAlloc(allocator, max_len);
        defer allocator.free(data);

        if (data.len > std.math.maxInt(u32)) return error.InputTooLarge;
        if (blob.items.len > std.math.maxInt(u32)) return error.ArchiveTooLarge;

        var crc = std.hash.crc.Crc32.init();
        crc.update(data);
        const digest = crc.final();

        const offset_u32 = @as(u32, @intCast(blob.items.len));
        const size_u32 = @as(u32, @intCast(data.len));
        const name_len_u16 = @as(u16, @intCast(sample.path.len));

        const header = makeLocalHeader(name_len_u16, digest, size_u32);
        try blob.appendSlice(allocator, header[0..]);
        try blob.appendSlice(allocator, sample.path);
        try blob.appendSlice(allocator, data);

        try entries.append(allocator, .{
            .name = sample.path,
            .crc32 = digest,
            .size = size_u32,
            .offset = offset_u32,
        });

        const gop = try metrics.getOrPut(sample.path);
        if (!gop.found_existing) {
            gop.key_ptr.* = try allocator.dupe(u8, sample.path);
        }
        gop.value_ptr.* = .{ .crc32 = digest, .size = data.len };

        progress.completeOne();
    }

    const central_offset_usize = blob.items.len;
    if (central_offset_usize > std.math.maxInt(u32)) return error.ArchiveTooLarge;
    const central_offset = @as(u32, @intCast(central_offset_usize));

    for (entries.items) |entry| {
        const header = makeCentralHeader(entry);
        try blob.appendSlice(allocator, header[0..]);
        try blob.appendSlice(allocator, entry.name);
    }

    const central_size = @as(u32, @intCast(blob.items.len - central_offset_usize));
    const footer = makeEndRecord(central_size, central_offset, @as(u16, @intCast(entries.items.len)));
    try blob.appendSlice(allocator, footer[0..]);

    var zip_file = try dir.createFile(archive_path, .{ .truncate = true, .read = true });
    defer zip_file.close();
    try zip_file.writeAll(blob.items);

    var sha256 = std.crypto.hash.sha2.Sha256.init(.{});
    sha256.update(blob.items);
    var digest_bytes: [32]u8 = undefined;
    sha256.final(&digest_bytes);

    return .{ .bytes_written = blob.items.len, .sha256 = digest_bytes };
}

fn extractAndVerify(
    allocator: std.mem.Allocator,
    dir: std.fs.Dir,
    metrics: *const std.StringHashMap(EntryMetrics),
    progress: *std.Progress.Node,
) !VerifySummary {
    try dir.makePath(extract_root);
    var dest_dir = try dir.openDir(extract_root, .{ .access_sub_paths = true, .iterate = true });
    defer dest_dir.close();

    var file = try dir.openFile(archive_path, .{});
    defer file.close();

    var read_buf: [4096]u8 = undefined;
    var reader = file.reader(&read_buf);

    var diagnostics = std.zip.Diagnostics{ .allocator = allocator };
    defer diagnostics.deinit();

    try std.zip.extract(dest_dir, &reader, .{ .diagnostics = &diagnostics });

    var files_checked: usize = 0;
    var total_bytes: usize = 0;

    for (sample_files) |sample| {
        var out_file = try dest_dir.openFile(sample.path, .{});
        defer out_file.close();
        const data = try out_file.readToEndAlloc(allocator, 64 * 1024);
        defer allocator.free(data);

        const expected = metrics.get(sample.path) orelse return error.ExpectedEntryMissing;
        var crc = std.hash.crc.Crc32.init();
        crc.update(data);
        if (crc.final() != expected.crc32 or data.len != expected.size) {
            return error.VerificationFailed;
        }
        files_checked += 1;
        total_bytes += data.len;
        progress.completeOne();
    }

    var result_root: []const u8 = "<scattered>";
    var owns_root = false;
    if (diagnostics.root_dir.len > 0) {
        result_root = try allocator.dupe(u8, diagnostics.root_dir);
        owns_root = true;
    }
    return .{
        .files_checked = files_checked,
        .total_bytes = total_bytes,
        .extracted_root = result_root,
        .owns_root = owns_root,
    };
}

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer {
        const leak_status = gpa.deinit();
        std.debug.assert(leak_status == .ok);
    }
    const allocator = gpa.allocator();

    var stdout_buffer: [512]u8 = undefined;
    var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer);
    const out = &stdout_writer.interface;

    var tmp = std.testing.tmpDir(.{});
    defer tmp.cleanup();

    var metrics = std.StringHashMap(EntryMetrics).init(allocator);
    defer {
        var it = metrics.iterator();
        while (it.next()) |kv| {
            allocator.free(kv.key_ptr.*);
        }
        metrics.deinit();
    }

    var progress_root = std.Progress.start(.{
        .root_name = "zip-pipeline",
        .estimated_total_items = 3,
        .disable_printing = true,
    });
    defer progress_root.end();

    var stage_seed = progress_root.start("seed", sample_files.len);
    const seeded = try seedSamples(tmp.dir, &stage_seed);
    stage_seed.end();
    try out.print("[1/3] seeded samples -> files={d}, bytes={d}\n", .{ seeded.files, seeded.bytes });

    var stage_build = progress_root.start("build", sample_files.len);
    const build_summary = try buildArchive(allocator, tmp.dir, &metrics, &stage_build);
    stage_build.end();

    const hex_digest = std.fmt.bytesToHex(build_summary.sha256, .lower);
    try out.print("[2/3] built archive -> bytes={d}\n    sha256={s}\n", .{ build_summary.bytes_written, hex_digest[0..] });

    var stage_verify = progress_root.start("verify", sample_files.len);
    const verify_summary = try extractAndVerify(allocator, tmp.dir, &metrics, &stage_verify);
    stage_verify.end();
    defer if (verify_summary.owns_root) allocator.free(verify_summary.extracted_root);
    try out.print(
        "[3/3] extracted + verified -> files={d}, bytes={d}, root={s}\n",
        .{ verify_summary.files_checked, verify_summary.total_bytes, verify_summary.extracted_root },
    );

    try out.flush();
}
Run
Shell
$ zig run zip_progress_pipeline.zig
Output
Shell
[1/3] seeded samples -> files=4, bytes=250
[2/3] built archive -> bytes=716
    sha256=4a13a3dc1e6ef90c252b0cc797ff14456aa28c670cafbc9d27a025b0079b05d5
[3/3] extracted + verified -> files=4, bytes=250, root=input

The verification step deliberately duplicates the extracted root string whenever the diagnostics discover a common prefix; the summary frees that buffer afterwards to keep the general-purpose allocator clean. This mirrors good hygiene for CLI tools that stream large archives through temporary directories (Chapter 52).

Notes and Caveats

  • The writer sticks to the classic (non-Zip64) subset; once a file exceeds 4 GiB you must upgrade the headers and extra fields, or delegate to a dedicated ZIP library (Chapter 44).
  • The progress nodes are nested but printing is disabled; if you want live TTY updates, remove .disable_printing = true and let the renderer draw its frames. Keep in mind that doing so sacrifices determinism in captured logs (Chapter 47).
  • CRC32 confirms integrity, not authenticity. Combine the SHA-256 digest with a signature, or attach the archive to a zig build step for a reproducible deployment pipeline (Chapter 39).

Exercises

  • Extend the builder to emit Zip64 records whenever a file crosses the 4 GiB boundary. Keep the classic path for small bundles, and write a regression test that validates both (Chapter 33).
  • Replace the in-memory blob with a streaming writer that flushes to disk in chunks; compare throughput and memory consumption under perf and zig build test with large synthetic files (Chapter 41).
  • Add a command-line flag that accepts an ignore list (glob patterns) before archiving, then report the exact number of skipped files alongside the existing totals (Chapter 36, Dir.zig).

Caveats, Alternatives, and Edge Cases

  • Streaming the archive straight to stdout works well for pipelines but makes verification trickier; consider writing to a temporary file first so you can reopen it for checksumming before shipping it onward (Chapter 28, File.zig).
  • ZIP encryption is deliberately out of scope. If you need confidentiality, wrap the resulting file with std.crypto primitives, or switch to a format such as an encrypted tarball with age or minisign (Chapter 45).
  • For multi-gigabyte corpora, read inputs in chunks and update the CRC32 incrementally instead of calling readToEndAlloc; otherwise the transient allocations balloon (Chapter 10). A sketch follows this list.
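
A minimal sketch of that incremental approach, using a hypothetical crc32OfFile helper and a fixed 64 KiB chunk size:

Zig
const std = @import("std");

// Hypothetical helper: stream a file through the CRC32 hasher in
// fixed-size chunks so memory stays flat regardless of file size.
fn crc32OfFile(dir: std.fs.Dir, path: []const u8) !u32 {
    var file = try dir.openFile(path, .{});
    defer file.close();

    var crc = std.hash.crc.Crc32.init();
    var chunk: [64 * 1024]u8 = undefined;
    while (true) {
        const n = try file.read(&chunk);
        if (n == 0) break; // end of file
        crc.update(chunk[0..n]);
    }
    return crc.final();
}

test "chunked crc matches whole-buffer crc" {
    const payload = "not actually big, but the chunk loop is identical";

    var tmp = std.testing.tmpDir(.{});
    defer tmp.cleanup();

    var f = try tmp.dir.createFile("big.bin", .{ .truncate = true });
    try f.writeAll(payload);
    f.close();

    var whole = std.hash.crc.Crc32.init();
    whole.update(payload);

    try std.testing.expectEqual(whole.final(), try crc32OfFile(tmp.dir, "big.bin"));
}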
