Source code for yann_utils.chunking

"""Utilities for chunking objects into bytes and reconstructing them."""""

import os
import pickle
import re
from pathlib import Path
from typing import Any, Iterable, Union


def chunk_obj(obj: Any, chunk_size_mb: Union[int, float]) -> Iterable[bytes]:
    """Pickle ``obj`` and split the serialized bytes into fixed-size chunks.

    Args:
        obj: The object to serialize with ``pickle.dumps``.
        chunk_size_mb: Size of each chunk in units of ``1024**2`` bytes.
            Fractional values are accepted; the resulting byte count is
            truncated to a whole number.

    Returns:
        A lazy iterable of ``bytes`` slices. Every slice except possibly the
        last has exactly the requested size, and joining the slices in order
        gives back the full pickled payload.

    Raises:
        ValueError: If the chunk size works out to less than one byte.
    """
    chunk_size_b = int(chunk_size_mb * 1024**2)
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently produces no chunks at all, so reject both up front.
    if chunk_size_b <= 0:
        raise ValueError(
            f"chunk_size_mb must yield at least one byte, got {chunk_size_mb!r}"
        )

    ser = pickle.dumps(obj)
    # Returned as a generator expression (not a generator function) so the
    # validation above runs at call time rather than on first iteration.
    return (ser[i : i + chunk_size_b] for i in range(0, len(ser), chunk_size_b))
def reconstruct_obj(chunks: Iterable[Union[str, os.PathLike]]) -> bytes:
    """Read chunk files in order and return their concatenated contents.

    Despite the name, this function does not unpickle anything: each element
    of ``chunks`` is opened as a file in binary mode, and the raw bytes of all
    files are joined in iteration order.

    Args:
        chunks: Paths of the chunk files, already in the order in which they
            should be concatenated.

    Returns:
        The concatenated bytes of all chunk files (``b""`` when ``chunks`` is
        empty).

    Raises:
        OSError: If a chunk file cannot be opened or read.
    """
    # NOTE(review): the result is still serialized data. Callers that want the
    # original object must pass it to pickle.loads, and should only do so for
    # chunk files that come from a trusted source.
    payloads = []
    for path in chunks:
        with open(path, "rb") as file:
            payloads.append(file.read())
    return b"".join(payloads)
def persist_chunks(chunks: Iterable[bytes], dir: str) -> None:
    """Write each chunk to its own numbered file inside ``dir``.

    The directory is created if it does not exist (including parents). The
    chunk at position ``i`` is written to ``{dir}/pkl.part{i}``, overwriting
    any file of that name. Nothing is returned.

    Args:
        chunks: The byte chunks to write, in order.
        dir: Target directory for the chunk files.
    """
    os.makedirs(dir, exist_ok=True)

    # NOTE(review): files left in `dir` by an earlier call are not removed, so
    # writing fewer chunks than before leaves stale higher-numbered parts.
    for index, payload in enumerate(chunks):
        target = f"{dir}/pkl.part{index}"
        with open(target, "wb") as sink:
            sink.write(payload)
def get_numeric_suffix(file_name: str) -> int:
    """Return the run of digits at the end of ``file_name`` as an integer.

    Args:
        file_name: A name or path that ends in one or more digits, for example
            ``"pkl.part12"``.

    Returns:
        The trailing digits converted with ``int``, so ``"pkl.part012"``
        gives ``12``.

    Raises:
        ValueError: If ``file_name`` does not end in a digit.
    """
    match = re.search(r"\d+$", file_name)
    if match is None:
        raise ValueError(f"File name {file_name} does not contain a numeric suffix")
    return int(match.group())
def get_chunks(dir: str) -> Iterable[str]:
    """List the chunk files in ``dir``, ordered by their numeric suffix.

    Entries directly inside ``dir`` whose names match ``pkl.part*`` are
    collected as strings and sorted with ``get_numeric_suffix`` as the key, so
    ``pkl.part10`` comes after ``pkl.part9`` rather than after ``pkl.part1``.

    Args:
        dir: Directory to search for chunk files.

    Returns:
        A list of path strings in ascending suffix order.

    Raises:
        ValueError: Propagated from ``get_numeric_suffix`` when a matching
            name does not end in digits.
    """
    part_paths = (str(entry) for entry in Path(dir).glob("pkl.part*"))
    return sorted(part_paths, key=get_numeric_suffix)