/* eslint-disable no-await-in-loop */
/**
* @author Michael Hasenstein <hasenstein@yahoo.com>
* @copyright REFINIO GmbH 2017
* @license CC-BY-NC-SA-2.5; portions MIT License
* @version 0.0.1
*/
/**
* ONE objects in storage form a directed graph. When ID object links are used they may form
* cycles, without ID links cycles are not possible (one would have to calculate the SHA-256
* hashes of objects directly or indirectly pointing at one another).
*
* This module implements a general ONE object graph iterator. It takes the hash of a concrete
* ONE object and then iterates over every single reachable node in the graph using a DEPTH
* FIRST strategy to avoid too much parallelism (too many open paths, potentially too much
* memory usage). It calls the given callback function for each ONE object node that it visits.
*
* It only descends if the linked file is a ONE object, it does not descend for links to BLOBs.
* We know those always are leave nodes though and those links are made available with the
* current object.
*
* @module
*/
export interface IteratorCbParamObjType {
type: typeof HashLinkType.OBJ;
hash: SHA256Hash;
obj: OneObjectTypes;
links: LinkedObjectsHashList;
path: ReadonlyArray<SHA256Hash<HashTypes> | SHA256IdHash>;
size: number | undefined;
}
export interface IteratorCbParamIdType {
type: typeof HashLinkType.ID;
hash: SHA256IdHash;
obj: OneIdObjectTypes;
links: LinkedObjectsHashList;
path: ReadonlyArray<SHA256Hash<HashTypes> | SHA256IdHash>;
size: number | undefined;
}
export interface IteratorCbParamBlobType {
type: typeof HashLinkType.BLOB;
hash: SHA256Hash<BLOB>;
path: ReadonlyArray<SHA256Hash<HashTypes> | SHA256IdHash>;
size: number | undefined;
}
export interface IteratorCbParamClobType {
type: typeof HashLinkType.CLOB;
hash: SHA256Hash<CLOB>;
path: ReadonlyArray<SHA256Hash<HashTypes> | SHA256IdHash>;
size: number | undefined;
}
/**
* The callback function given to the {@link object-graph-iterator.module:ts|object-graph-iterator}
* to call for each object receives an object of this type as its parameter.
* @typedef {object} IteratorCbParam
* @property {('hash'|'id'|'blob'|'clob')} type
*/
export type IteratorCbParam =
| IteratorCbParamObjType
| IteratorCbParamIdType
| IteratorCbParamBlobType
| IteratorCbParamClobType;
/**
* This object provides human-readable names for constants used in {@link
* ObjectGraphIteratorOptions}. The object-graph-iterator has these three options how to treat ID
* hash links.
* - It can iterate over the ID object itself
* - It can find the latest version for the ID hash and continue iteration with the object the
* hash points to.
* - It can find _all_ versions for the ID hash and continue iteration with all the object the
* hashes points to.
* - It can ignore ID references altogether
* @global
* @type {{ID_OBJECT: 0, ALL_VERSIONS: 2, LATEST_VERSION: 1, NONE: 3}}
*/
export const ID_LINK_ITERATION = {
ID_OBJECT: 0,
LATEST_VERSION: 1,
ALL_VERSIONS: 2,
NONE: 3
} as const;
/**
* The actual numeric values of the {@link ID_LINK_ITERATION} enum (object).
* @global
* @type {Set<0 | 1 | 2 | 3>}
*/
export const ID_LINK_ITERATION_VALUES = new Set(Object.values(ID_LINK_ITERATION));
/**
* The options object parameter of the
* {@link object-graph-iterator.module:ts.iterateGraphFromObjectBottomUp|`iterateGraphFromObjectBottomUp`} function.
* @typedef {object} ObjectGraphIteratorOptions
* @property {(0|1|2)} [idLinkIteration] - How to handle ID hash links: Not at all, iterate to
* the latest version (hash), iterate over all versions (hashes)
* @property {boolean} [includeFileSize] - If `true` the callback's {@link IteratorCbParam}
* parameter is going to include the byte includeFileSize of the current node/object. When
* storage encryption is enabled the includeFileSize will be +-16 bytes of the true value due
* to random padding of 32 bytes added to the encrypted container.
* {@link iterateGraphFromAllVersionsOfIdObject} because when iterating over an ID object the
* iterator automatically uses each version's timestamp.
*/
export interface ObjectGraphIteratorOptions {
idLinkIteration?: (typeof ID_LINK_ITERATION)[keyof typeof ID_LINK_ITERATION];
includeFileSize?: boolean;
}
interface WalkerOptions {
cb: (param: IteratorCbParam) => void | Promise<void>;
idLinkIteration: (typeof ID_LINK_ITERATION)[keyof typeof ID_LINK_ITERATION];
includeFileSize: boolean;
}
/**
* A `StopIteratorError` is a type of `Error` with a fixed name property string value.
* You can stop the object graph iteration without the iterator promise rejecting by throwing an
* Error whose `name` property is "StopIteratorError" from within the callback function. Any
* other error causes the iterator promise to reject with that error.
*
* **Note that this is not a special class**, i.e. you cannot use `instanceof`. The object is
* created by _decorating_ a given `Error` object. To check for this error check the "`name`"
* property!
* @global
* @typedef {object} StopIteratorError
* @property {'StopIteratorError'} name - Fixed string 'StopIteratorError'
* @property {string} message - An error message - but only the name property is important
*/
export interface StopIteratorError extends Error {
name: 'StopIteratorError';
}
/**
* A concrete version hash of a versioned object identified by an ID hash, and the timestamp of
* version map entry creation for that concrete hash.
* @typedef {object} HashAndIdHashAndTimestamp
* @property {SHA256Hash} hash
* @property {SHA256IdHash} idHash
* @property {number} timestamp
*/
export interface HashAndIdHashAndTimestamp<
T extends OneVersionedObjectTypes = OneVersionedObjectTypes
> {
// NOTE: Typescript does NOT check or enforce the two "T" to be equal - usually it is a
// UNION TYPE and only those unions are equal. The actual name used can be any from the
// union for either hash. We would have to "infer" a second type param by extracting the "T"
// from one of the hashes and use the inferred name for the second hash. Now they are the
// same but this interface becomes too complex.
hash: SHA256Hash<T>;
idHash: SHA256IdHash<T>;
timestamp: number;
}
import {createError} from './errors';
import type {HashLinkTypeNames} from './object-to-microdata';
import {HashLinkType} from './object-to-microdata';
import type {
BLOB,
CLOB,
HashTypes,
OneIdObjectTypes,
OneObjectTypes,
OneVersionedObjectTypes
} from './recipes';
import {getObject} from './storage-unversioned-objects';
import {getIdObject} from './storage-versioned-objects';
import {fileSize} from './system/storage-base';
import {getAllHashesForIdHashes, getLatestHashesForIdHashes} from './util/hashes-from-id';
import type {LinkedObjectsHashList} from './util/object-find-links';
import {findLinkedHashesInObject} from './util/object-find-links';
import type {SHA256Hash, SHA256IdHash} from './util/type-checks';
/**
* A string constant `'StopIteratorError'` for the name property of errors of this type. This
* helps avoid having multiple versions of this same string in memory.
* @private
* @type {'StopIteratorError'}
*/
export const STOP_ITERATOR_ERROR = 'StopIteratorError';
/**
* Convenience function which creates an `Error` object with its name property set to
* `StopIteratorError`.
* @static
* @returns {StopIteratorError} Returns an error object with name property `StopIteratorError`
*/
export function createStopIteratorError(): StopIteratorError {
return createError('OGI-ERR', {name: STOP_ITERATOR_ERROR}) as StopIteratorError;
}
/**
* Internal graph iteration function that takes a lot of purely internal arguments for the
* recursion
* @private
* @param {(SHA256Hash|SHA256IdHash)} hash
* @param {HashLinkTypeNames} type
* @param {Array<SHA256Hash|SHA256IdHash>} path
* @param {ObjectGraphIteratorOptions} options
* @param {Set<SHA256Hash|SHA256IdHash>} [visited] - Internal parameter to avoid duplicates.
* It is supposed to be set to a new empty Set at the beginning of the iteration, so when
* calling this function this should be a new empty Set or undefined. Subsequent recursive
* iteration passes this Set through so that it is "global" for all iterations.
* @returns {Promise<undefined|number>} When option `includeFileSize` is `true` resolves with the
* size of the subtree under (and including) the given hash.
*/
async function iterate(
hash: SHA256Hash<HashTypes> | SHA256IdHash,
type: HashLinkTypeNames,
path: ReadonlyArray<SHA256Hash<HashTypes> | SHA256IdHash>,
{cb, idLinkIteration, includeFileSize}: WalkerOptions,
visited: Set<SHA256Hash<HashTypes> | SHA256IdHash>
): Promise<void> {
// This should only ever be possible at iteration start, but we place it in the recursive
// function just in case there is anything weird in the data.
if (hash === undefined) {
throw createError('OGI-WALK1');
}
if (!ID_LINK_ITERATION_VALUES.has(idLinkIteration)) {
throw createError('OGI-WALK2', {all: ID_LINK_ITERATION_VALUES, val: idLinkIteration});
}
// Every node is iterated over only once no matter how often it appears in the graph
if (visited.has(hash)) {
return;
}
visited.add(hash);
const size = includeFileSize ? await fileSize(hash) : undefined;
// Optimization: Create a copy only once.
// WARNING: This array could be mutated by the callback if it's called. If we want a
// guarantee we would have to copy this array. I'm not doing that now to save the memory.
const newPath = [...path, hash] as const;
if (type === HashLinkType.BLOB) {
await cb({
type: HashLinkType.BLOB,
hash: hash as SHA256Hash<BLOB>,
path,
size
} as const);
return;
}
if (type === HashLinkType.CLOB) {
await cb({
type: HashLinkType.CLOB,
hash: hash as SHA256Hash<CLOB>,
path,
size
} as const);
return;
}
const obj =
type === HashLinkType.ID
? await getIdObject(hash as SHA256IdHash)
: await getObject(hash as SHA256Hash);
const links = findLinkedHashesInObject(obj);
const options = {
cb,
idLinkIteration,
includeFileSize
} as const;
// SEQUENTIAL asynchronous iteration to avoid too many simultaneous operations. This way
// we follow a single path to a single leaf node to the end before exploring the next path.
// WARNING: links.references could have been mutated by the callback if it was called. If we
// want a guarantee we would have to copy this array. I'm not doing that now to save the memory.
for (const ref of links.references) {
// eslint-disable-next-line no-await-in-loop
await iterate(ref, HashLinkType.OBJ, newPath, options, visited);
}
if (idLinkIteration === ID_LINK_ITERATION.ID_OBJECT) {
for (const idRef of links.idReferences) {
// eslint-disable-next-line no-await-in-loop
await iterate(idRef, HashLinkType.ID, newPath, options, visited);
}
}
if (
links.idReferences.length > 0 &&
(idLinkIteration === ID_LINK_ITERATION.LATEST_VERSION ||
idLinkIteration === ID_LINK_ITERATION.ALL_VERSIONS)
) {
const hashesForIdReferences =
idLinkIteration === ID_LINK_ITERATION.LATEST_VERSION
? await getLatestHashesForIdHashes(links.idReferences)
: await getAllHashesForIdHashes(links.idReferences);
for (const {hash: childHash} of hashesForIdReferences) {
// Sequentially - not in parallel
// eslint-disable-next-line no-await-in-loop
await iterate(
childHash,
HashLinkType.OBJ,
newPath,
// ID reference special: We have a timestamp for the current (sub-graph root-)
// object, since it can be newer than the "global" root object.
{
cb,
idLinkIteration,
includeFileSize
},
visited
);
}
}
for (const blobHash of links.blobs) {
await iterate(blobHash, HashLinkType.BLOB, newPath, options, visited);
}
for (const clobHash of links.clobs) {
await iterate(clobHash, HashLinkType.CLOB, newPath, options, visited);
}
await cb(
type === HashLinkType.OBJ
? ({
type: HashLinkType.OBJ,
hash: hash as SHA256Hash,
obj: obj as OneObjectTypes,
links,
path,
size
} as const)
: ({
type: 'id',
hash: hash as SHA256IdHash,
obj: obj as OneIdObjectTypes,
links,
path,
size
} as const)
);
}
/**
* Function to iterate over a ONE object graph in storage starting from a given root node. The
* function returns a promise that resolves when all nodes have been visited, or rejects when
* there was an error - for example in the given per-node callback function.
*
* - Depth first: The algorithm descends the first link it finds in an object, and only when
* it went all the way down and visited all nodes on the linked object does it visit the next
* link in the current object.
*
* - Direction either is top-down (default) or bottom-up.
* Top down: The algorithm calls the given callback function for each node **before** it
* visits any linked nodes. That means the callback function is first called for parent nodes,
* and then for the children (descending into the graph).
* Bottom up: The algorithm only calls the given callback function for each node **after** it
* has visited all its linked nodes. That means the callback function is called for leaf nodes
* first, then for its parents.
*
* - Every node is sent only once even if it is linked more than once.
*
* - Child node iteration order is consistent but depends on the recipe and the data: If your
* Reference objects are in an array their order may be either determined by your application
* or by ONE, which sorts them alphabetically to ensure microdata that always produces the
* same hash.
*
* - Versioned and unversioned concrete object hash links: The callback is called for both kinds
* of ONE objects. The objects are loaded from storage and the callback is called.
*
* - Clob/Blob hash links: Those are iterated over just like the ONE object and ONE ID object
* nodes. That means that the callback **is** called, but the data it receives is slightly
* different from the one for ONE [ID] objects, see {@link IteratorCbParam} union type.
*
* - ID object links: The behavior when iterating over ID links is configurable (option
* parameter). The callback function is called for each version or only for the latest
* version. Cycles created by ID object links are detected and iterated over only once.
*
* You can **stop iteration** without the iterator promise rejecting by throwing an Error
* whose `name` property is "StopIteratorError" from within the callback function. Any other error
* causes the iterator promise to reject with that error.
*
* @static
* @async
* @param {SHA256Hash} hash - The SHA-256 hash of the ONE object in storage that is to be used as
* root node
* @param {function(IteratorCbParam):undefined|Promise<undefined>} cb - Callback function called
* for each node (ONE object, ONE ID object if ID object iteration is enabled, BLOB or CLOB). If
* the callback function returns a promise the iteration waits until the promise resolves before
* proceeding. Return values are ignored.
* @param {ObjectGraphIteratorOptions} [options] - An optional options object
* @returns {Promise<boolean>} Returns a promise that resolves when the iteration is over. It
* resolves with `true` when the iteration completed, i.e. all objects in the graph were
* visited. It resolves with `false` if the iterator was stopped by the callback using an Error
* whose name property is `StopIteratorError`.
*/
export async function iterateGraphFromObjectBottomUp(
hash: SHA256Hash,
cb: (param: IteratorCbParam) => void | Promise<void>,
{
idLinkIteration = ID_LINK_ITERATION.ID_OBJECT,
includeFileSize = false
}: ObjectGraphIteratorOptions = {}
): Promise<boolean> {
try {
await iterate(
hash,
HashLinkType.OBJ,
[],
{
cb,
idLinkIteration,
includeFileSize
},
new Set()
);
} catch (err) {
// If the callback function throws this particular error the iterator just stops, but
// the overall promise status remains a success.
if (err.name === STOP_ITERATOR_ERROR) {
return false;
}
throw err;
}
return true;
}