Source: object-graph-bottom-up-iterator.ts

/* eslint-disable no-await-in-loop */

/**
 * @author Michael Hasenstein <hasenstein@yahoo.com>
 * @copyright REFINIO GmbH 2017
 * @license CC-BY-NC-SA-2.5; portions MIT License
 * @version 0.0.1
 */

/**
 * ONE objects in storage form a directed graph. When ID object links are used they may form
 * cycles, without ID links cycles are not possible (one would have to calculate the SHA-256
 * hashes of objects directly or indirectly pointing at one another).
 *
 * This module implements a general ONE object graph iterator. It takes the hash of a concrete
 * ONE object and then iterates over every single reachable node in the graph using a DEPTH
 * FIRST strategy to avoid too much parallelism (too many open paths, potentially too much
 * memory usage). It calls the given callback function for each ONE object node that it visits.
 *
 * It only descends if the linked file is a ONE object, it does not descend for links to BLOBs.
 * We know those always are leave nodes though and those links are made available with the
 * current object.
 *
 * @module
 */

export interface IteratorCbParamObjType {
    type: typeof HashLinkType.OBJ;
    hash: SHA256Hash;
    obj: OneObjectTypes;
    links: LinkedObjectsHashList;
    path: ReadonlyArray<SHA256Hash<HashTypes> | SHA256IdHash>;
    size: number | undefined;
}

export interface IteratorCbParamIdType {
    type: typeof HashLinkType.ID;
    hash: SHA256IdHash;
    obj: OneIdObjectTypes;
    links: LinkedObjectsHashList;
    path: ReadonlyArray<SHA256Hash<HashTypes> | SHA256IdHash>;
    size: number | undefined;
}

export interface IteratorCbParamBlobType {
    type: typeof HashLinkType.BLOB;
    hash: SHA256Hash<BLOB>;
    path: ReadonlyArray<SHA256Hash<HashTypes> | SHA256IdHash>;
    size: number | undefined;
}

export interface IteratorCbParamClobType {
    type: typeof HashLinkType.CLOB;
    hash: SHA256Hash<CLOB>;
    path: ReadonlyArray<SHA256Hash<HashTypes> | SHA256IdHash>;
    size: number | undefined;
}

/**
 * The callback function given to the {@link object-graph-iterator.module:ts|object-graph-iterator}
 * to call for each object receives an object of this type as its parameter.
 * @typedef {object} IteratorCbParam
 * @property {('hash'|'id'|'blob'|'clob')} type
 */
export type IteratorCbParam =
    | IteratorCbParamObjType
    | IteratorCbParamIdType
    | IteratorCbParamBlobType
    | IteratorCbParamClobType;

/**
 * This object provides human-readable names for constants used in {@link
 * ObjectGraphIteratorOptions}. The object-graph-iterator has these three options how to treat ID
 * hash links.
 * - It can iterate over the ID object itself
 * - It can find the latest version for the ID hash and continue iteration with the object the
 *   hash points to.
 * - It can find _all_ versions for the ID hash and continue iteration with all the object the
 *   hashes points to.
 * - It can ignore ID references altogether
 * @global
 * @type {{ID_OBJECT: 0, ALL_VERSIONS: 2, LATEST_VERSION: 1, NONE: 3}}
 */
export const ID_LINK_ITERATION = {
    ID_OBJECT: 0,
    LATEST_VERSION: 1,
    ALL_VERSIONS: 2,
    NONE: 3
} as const;

/**
 * The actual numeric values of the {@link ID_LINK_ITERATION} enum (object).
 * @global
 * @type {Set<0 | 1 | 2 | 3>}
 */
export const ID_LINK_ITERATION_VALUES = new Set(Object.values(ID_LINK_ITERATION));

/**
 * The options object parameter of the
 * {@link object-graph-iterator.module:ts.iterateGraphFromObjectBottomUp|`iterateGraphFromObjectBottomUp`} function.
 * @typedef {object} ObjectGraphIteratorOptions
 * @property {(0|1|2)} [idLinkIteration] - How to handle ID hash links: Not at all, iterate to
 * the latest version (hash), iterate over all versions (hashes)
 * @property {boolean} [includeFileSize] - If `true` the callback's {@link IteratorCbParam}
 * parameter is going to include the byte includeFileSize of the current node/object. When
 * storage encryption is enabled the includeFileSize will be +-16 bytes of the true value due
 * to random padding of 32 bytes added to the encrypted container.
 * {@link iterateGraphFromAllVersionsOfIdObject} because when iterating over an ID object the
 * iterator automatically uses each version's timestamp.
 */
export interface ObjectGraphIteratorOptions {
    idLinkIteration?: (typeof ID_LINK_ITERATION)[keyof typeof ID_LINK_ITERATION];
    includeFileSize?: boolean;
}

interface WalkerOptions {
    cb: (param: IteratorCbParam) => void | Promise<void>;
    idLinkIteration: (typeof ID_LINK_ITERATION)[keyof typeof ID_LINK_ITERATION];
    includeFileSize: boolean;
}

/**
 * A `StopIteratorError` is a type of `Error` with a fixed name property string value.
 * You can stop the object graph iteration without the iterator promise rejecting by throwing an
 * Error whose `name` property is "StopIteratorError" from within the callback function. Any
 * other error causes the iterator promise to reject with that error.
 *
 * **Note that this is not a special class**, i.e. you cannot use `instanceof`. The object is
 * created by _decorating_ a given `Error` object. To check for this error check the "`name`"
 * property!
 * @global
 * @typedef {object} StopIteratorError
 * @property {'StopIteratorError'} name - Fixed string 'StopIteratorError'
 * @property {string} message - An error message - but only the name property is important
 */
export interface StopIteratorError extends Error {
    name: 'StopIteratorError';
}

/**
 * A concrete version hash of a versioned object identified by an ID hash, and the timestamp of
 * version map entry creation for that concrete hash.
 * @typedef {object} HashAndIdHashAndTimestamp
 * @property {SHA256Hash} hash
 * @property {SHA256IdHash} idHash
 * @property {number} timestamp
 */
export interface HashAndIdHashAndTimestamp<
    T extends OneVersionedObjectTypes = OneVersionedObjectTypes
> {
    // NOTE: Typescript does NOT check or enforce the two "T" to be equal - usually it is a
    // UNION TYPE and only those unions are equal. The actual name used can be any from the
    // union for either hash. We would have to "infer" a second type param by extracting the "T"
    // from one of the hashes and use the inferred name for the second hash. Now they are the
    // same but this interface becomes too complex.
    hash: SHA256Hash<T>;
    idHash: SHA256IdHash<T>;
    timestamp: number;
}

import {createError} from './errors';
import type {HashLinkTypeNames} from './object-to-microdata';
import {HashLinkType} from './object-to-microdata';
import type {
    BLOB,
    CLOB,
    HashTypes,
    OneIdObjectTypes,
    OneObjectTypes,
    OneVersionedObjectTypes
} from './recipes';
import {getObject} from './storage-unversioned-objects';
import {getIdObject} from './storage-versioned-objects';
import {fileSize} from './system/storage-base';
import {getAllHashesForIdHashes, getLatestHashesForIdHashes} from './util/hashes-from-id';
import type {LinkedObjectsHashList} from './util/object-find-links';
import {findLinkedHashesInObject} from './util/object-find-links';
import type {SHA256Hash, SHA256IdHash} from './util/type-checks';

/**
 * A string constant `'StopIteratorError'` for the name property of errors of this type. This
 * helps avoid having multiple versions of this same string in memory.
 * @private
 * @type {'StopIteratorError'}
 */
export const STOP_ITERATOR_ERROR = 'StopIteratorError';

/**
 * Convenience function which creates an `Error` object with its name property set to
 * `StopIteratorError`.
 * @static
 * @returns {StopIteratorError} Returns an error object with name property `StopIteratorError`
 */
export function createStopIteratorError(): StopIteratorError {
    return createError('OGI-ERR', {name: STOP_ITERATOR_ERROR}) as StopIteratorError;
}

/**
 * Internal graph iteration function that takes a lot of purely internal arguments for the
 * recursion
 * @private
 * @param {(SHA256Hash|SHA256IdHash)} hash
 * @param {HashLinkTypeNames} type
 * @param {Array<SHA256Hash|SHA256IdHash>} path
 * @param {ObjectGraphIteratorOptions} options
 * @param {Set<SHA256Hash|SHA256IdHash>} [visited] - Internal parameter to avoid duplicates.
 * It is supposed to be set to a new empty Set at the beginning of the iteration, so when
 * calling this function this should be a new empty Set or undefined. Subsequent recursive
 * iteration passes this Set through so that it is "global" for all iterations.
 * @returns {Promise<undefined|number>} When option `includeFileSize` is `true` resolves with the
 * size of the subtree under (and including) the given hash.
 */
async function iterate(
    hash: SHA256Hash<HashTypes> | SHA256IdHash,
    type: HashLinkTypeNames,
    path: ReadonlyArray<SHA256Hash<HashTypes> | SHA256IdHash>,
    {cb, idLinkIteration, includeFileSize}: WalkerOptions,
    visited: Set<SHA256Hash<HashTypes> | SHA256IdHash>
): Promise<void> {
    // This should only ever be possible at iteration start, but we place it in the recursive
    // function just in case there is anything weird in the data.
    if (hash === undefined) {
        throw createError('OGI-WALK1');
    }

    if (!ID_LINK_ITERATION_VALUES.has(idLinkIteration)) {
        throw createError('OGI-WALK2', {all: ID_LINK_ITERATION_VALUES, val: idLinkIteration});
    }

    // Every node is iterated over only once no matter how often it appears in the graph
    if (visited.has(hash)) {
        return;
    }

    visited.add(hash);

    const size = includeFileSize ? await fileSize(hash) : undefined;

    // Optimization: Create a copy only once.
    // WARNING: This array could be mutated by the callback if it's called. If we want a
    // guarantee we would have to copy this array. I'm not doing that now to save the memory.
    const newPath = [...path, hash] as const;

    if (type === HashLinkType.BLOB) {
        await cb({
            type: HashLinkType.BLOB,
            hash: hash as SHA256Hash<BLOB>,
            path,
            size
        } as const);
        return;
    }

    if (type === HashLinkType.CLOB) {
        await cb({
            type: HashLinkType.CLOB,
            hash: hash as SHA256Hash<CLOB>,
            path,
            size
        } as const);
        return;
    }

    const obj =
        type === HashLinkType.ID
            ? await getIdObject(hash as SHA256IdHash)
            : await getObject(hash as SHA256Hash);

    const links = findLinkedHashesInObject(obj);

    const options = {
        cb,
        idLinkIteration,
        includeFileSize
    } as const;

    // SEQUENTIAL asynchronous iteration to avoid too many simultaneous operations. This way
    // we follow a single path to a single leaf node to the end before exploring the next path.
    // WARNING: links.references could have been mutated by the callback if it was called. If we
    // want a guarantee we would have to copy this array. I'm not doing that now to save the memory.
    for (const ref of links.references) {
        // eslint-disable-next-line no-await-in-loop
        await iterate(ref, HashLinkType.OBJ, newPath, options, visited);
    }

    if (idLinkIteration === ID_LINK_ITERATION.ID_OBJECT) {
        for (const idRef of links.idReferences) {
            // eslint-disable-next-line no-await-in-loop
            await iterate(idRef, HashLinkType.ID, newPath, options, visited);
        }
    }

    if (
        links.idReferences.length > 0 &&
        (idLinkIteration === ID_LINK_ITERATION.LATEST_VERSION ||
            idLinkIteration === ID_LINK_ITERATION.ALL_VERSIONS)
    ) {
        const hashesForIdReferences =
            idLinkIteration === ID_LINK_ITERATION.LATEST_VERSION
                ? await getLatestHashesForIdHashes(links.idReferences)
                : await getAllHashesForIdHashes(links.idReferences);

        for (const {hash: childHash} of hashesForIdReferences) {
            // Sequentially - not in parallel
            // eslint-disable-next-line no-await-in-loop
            await iterate(
                childHash,
                HashLinkType.OBJ,
                newPath,
                // ID reference special: We have a timestamp for the current (sub-graph root-)
                // object, since it can be newer than the "global" root object.
                {
                    cb,
                    idLinkIteration,
                    includeFileSize
                },
                visited
            );
        }
    }

    for (const blobHash of links.blobs) {
        await iterate(blobHash, HashLinkType.BLOB, newPath, options, visited);
    }

    for (const clobHash of links.clobs) {
        await iterate(clobHash, HashLinkType.CLOB, newPath, options, visited);
    }

    await cb(
        type === HashLinkType.OBJ
            ? ({
                  type: HashLinkType.OBJ,
                  hash: hash as SHA256Hash,
                  obj: obj as OneObjectTypes,
                  links,
                  path,
                  size
              } as const)
            : ({
                  type: 'id',
                  hash: hash as SHA256IdHash,
                  obj: obj as OneIdObjectTypes,
                  links,
                  path,
                  size
              } as const)
    );
}

/**
 * Function to iterate over a ONE object graph in storage starting from a given root node. The
 * function returns a promise that resolves when all nodes have been visited, or rejects when
 * there was an error - for example in the given per-node callback function.
 *
 * - Depth first: The algorithm descends the first link it finds in an object, and only when
 *   it went all the way down and visited all nodes on the linked object does it visit the next
 *   link in the current object.
 *
 * - Direction either is top-down (default) or bottom-up.
 *   Top down: The algorithm calls the given callback function for each node **before** it
 *   visits any linked nodes. That means the callback function is first called for parent nodes,
 *   and then for the children (descending into the graph).
 *   Bottom up: The algorithm only calls the given callback function for each node **after** it
 *   has visited all its linked nodes. That means the callback function is called for leaf nodes
 *   first, then for its parents.
 *
 * - Every node is sent only once even if it is linked more than once.
 *
 * - Child node iteration order is consistent but depends on the recipe and the data: If your
 *   Reference objects are in an array their order may be either determined by your application
 *   or by ONE, which sorts them alphabetically to ensure microdata that always produces the
 *   same hash.
 *
 * - Versioned and unversioned concrete object hash links: The callback is called for both kinds
 *   of ONE objects. The objects are loaded from storage and the callback is called.
 *
 * - Clob/Blob hash links: Those are iterated over just like the ONE object and ONE ID object
 *   nodes. That means that the callback **is** called, but the data it receives is slightly
 *   different from the one for ONE [ID] objects, see {@link IteratorCbParam} union type.
 *
 * - ID object links: The behavior when iterating over ID links is configurable (option
 *   parameter). The callback function is called for each version or only for the latest
 *   version. Cycles created by ID object links are detected and iterated over only once.
 *
 * You can **stop iteration** without the iterator promise rejecting by throwing an Error
 * whose `name` property is "StopIteratorError" from within the callback function. Any other error
 * causes the iterator promise to reject with that error.
 *
 * @static
 * @async
 * @param {SHA256Hash} hash - The SHA-256 hash of the ONE object in storage that is to be used as
 * root node
 * @param {function(IteratorCbParam):undefined|Promise<undefined>} cb - Callback function called
 * for each node (ONE object, ONE ID object if ID object iteration is enabled, BLOB or CLOB). If
 * the callback function returns a promise the iteration waits until the promise resolves before
 * proceeding. Return values are ignored.
 * @param {ObjectGraphIteratorOptions} [options] - An optional options object
 * @returns {Promise<boolean>} Returns a promise that resolves when the iteration is over. It
 * resolves with `true` when the iteration completed, i.e. all objects in the graph were
 * visited. It resolves with `false` if the iterator was stopped by the callback using an Error
 * whose name property is `StopIteratorError`.
 */
export async function iterateGraphFromObjectBottomUp(
    hash: SHA256Hash,
    cb: (param: IteratorCbParam) => void | Promise<void>,
    {
        idLinkIteration = ID_LINK_ITERATION.ID_OBJECT,
        includeFileSize = false
    }: ObjectGraphIteratorOptions = {}
): Promise<boolean> {
    try {
        await iterate(
            hash,
            HashLinkType.OBJ,
            [],
            {
                cb,
                idLinkIteration,
                includeFileSize
            },
            new Set()
        );
    } catch (err) {
        // If the callback function throws this particular error the iterator just stops, but
        // the overall promise status remains a success.
        if (err.name === STOP_ITERATOR_ERROR) {
            return false;
        }

        throw err;
    }

    return true;
}