JSDoc: Source: microdata-exploder.ts

/**
 * @author Michael Hasenstein <hasenstein@yahoo.com>
 * @copyright REFINIO GmbH 2018
 * @license CC-BY-NC-SA-2.5; portions MIT License
 * @version 0.0.1
 */

/**
 * @module
 */

/* ***********************************************************************************************
 *
 * THIS MODULE IS ALMOST EXACTLY THE SAME AS microdata-to-object.js
 *
 * Main difference: all functions are asynchronous, and this is why I did not try to extract a
 * common core. As they are written right now they are pretty clear; if I try to handle both
 * synchronous and asynchronous cases, or try to extract the tiny part that remains common, it
 * becomes a mess.
 *
 * Other differences:
 * - parseObject() replaces all included objects with a reference object (ID, versioned or
 *   unversioned reference depending on rule.referenceToObj or rule.referenceToId) after saving
 *   them as separate object
 * - "WriteStorage" is passed through to all functions. Only parseObject() needs it, but it is
 *   called from another function and calls another function which calls that other function...
 *   in order to keep the code as similar as possible to microdata-to-object I simply pass it
 *   through rather than doing something more fancy since it requires the smallest code changes.
 * - Parsing of the outer span tag of included objects ("inner header") needs to also parse tag
 *   attributes data-hash="..." and the optional data-id-hash="...", which only exist on
 *   imploded microdata.
 *
 * So, no "DRY" until somebody can figure out a better way - *that doesn't create less readable
 * code*.
 *
 *
 * SPEED INEFFICIENCY
 *
 * Also, to keep the code as much the same as possible to the one in microdata-to-object to not
 * make it harder to find common ground, we keep the sequential processing. However, we could
 * change the parsing to go ahead after spawning an asynchronous task and then end up having
 * several of them running in parallel, e.g. while parsing an array of values. That was useless
 * and undesirable in the purely synchronous parser because it does not save anything there, but
 * here we have to wait for included objects to be saved that could benefit from parallelization.
 *
 * ********************************************************************************************* */

/* eslint-disable no-await-in-loop */

/**
 * Result tuple of parsing the outer `<span>` tag of an included object, in the order
 * `[type, hash, idHash]` (see parseInnerHeader()). The third element is `undefined` when the
 * tag carries no `data-id-hash` attribute.
 * @private
 * @typedef {[OneObjectTypeNames, SHA256Hash, (undefined|SHA256IdHash)]} TypeAndHashAndIdHash
 */
type TypeAndHashAndIdHash = [OneObjectTypeNames, SHA256Hash, undefined | SHA256IdHash];

import {createError} from './errors';
import type {ParseContext} from './microdata-to-object';
import {
    breakMicrodataIntoArray,
    CONVERSION_FUNCTIONS,
    extractMicrodataWithTag,
    findClosingTag,
    unescapeFromHtml
} from './microdata-to-object';
import {
    ensureValidTypeName,
    getRecipe,
    isVersionedObject,
    resolveRuleInheritance
} from './object-recipes';
import type {HashLinkTypeNames} from './object-to-microdata';
import {HashLinkType} from './object-to-microdata';
import type {
    BagValue,
    MapValue,
    ObjectValue,
    OneObjectTypeNames,
    OneObjectTypes,
    RecipeRule,
    SetValue,
    ValueType
} from './recipes';
import type {AnyObjectCreation} from './storage-base-common';
import {storeUTF8Clob} from './storage-blob';
import {storeUnversionedObject} from './storage-unversioned-objects';
import {storeVersionedObject} from './storage-versioned-objects';
import {createFileWriteStream} from './system/storage-streams';
import {substrForceMemCopy} from './util/string';
import type {SHA256Hash, SHA256IdHash} from './util/type-checks';
import {ensureHash, ensureIdHash} from './util/type-checks';

// Dispatch table of the extraction helpers defined in this module. The switch over the value
// type in parseMicrodataByTheExpectedType() picks the entry matching the recipe rule's type.
// NOTE(review): "extracObjectTypeFromMicrodata" is presumably a typo for "extractObject..." -
// a rename has to change the function declaration together with this table.
const EXTRACT_FUNCTIONS = {
    primitive: extractPrimitiveTypeFromMicrodata,
    unorderedCollection: extractUnorderedListTypeFromMicrodata,
    orderedCollection: extractOrderedListTypeFromMicrodata,
    map: extractMapTypeFromMicrodata,
    obj: extracObjectTypeFromMicrodata
} as const;

// Closing tags the parser expects after a value (span) or after an included object (div). Their
// lengths are used to advance the parse position past the respective tag.
const SPAN_END = '</span>';
const DIV_END = '</div>';

/**
 * Reads the raw string of a primitive value ({@link PrimitiveValueTypes}) from the microdata.
 *
 * For a nested value the complete HTML string of the given context is the value. Otherwise the
 * value is expected inside `<span itemprop="[rule.itemprop]">...</span>` starting exactly at
 * `CONTEXT.position`; when it is found the position is advanced past the closing span tag.
 * @param {ParseContext} CONTEXT
 * @param {RecipeRule} rule - Only its `itemprop` is used, to build the expected opening tag
 * @param {boolean} isNested - `true` if the value stands by itself, without a span tag
 * @returns {unknown} The unconverted value string, or `undefined` if the expected opening span
 * tag is not at the current position
 */
function extractPrimitiveTypeFromMicrodata(
    CONTEXT: ParseContext,
    rule: RecipeRule,
    isNested: boolean = false
): unknown {
    // Nested values come without span tags: the whole string is the value
    if (isNested) {
        return CONTEXT.html;
    }

    const openingTag = `<span itemprop="${rule.itemprop}">`;

    if (!CONTEXT.html.startsWith(openingTag, CONTEXT.position)) {
        return undefined;
    }

    // The value is everything between the opening tag and the next '<'
    const from = CONTEXT.position + openingTag.length;
    const to = CONTEXT.html.indexOf('<', from);

    // NOTE(review): the third argument is passed as a length (end - start) - confirm against
    // the signature of substrForceMemCopy()
    const valueStr = substrForceMemCopy(CONTEXT.html, from, to - from);

    // Skip the opening tag, the value and the closing span tag
    CONTEXT.position = from + valueStr.length + SPAN_END.length;

    return valueStr;
}

/**
 * Extracts an ordered list (`<ol>`) from the microdata at the current parse position and parses
 * each `<li>` item by the given item type.
 *
 * When a list is found `CONTEXT.position` is advanced past the closing `</ol>` tag. When the
 * expected opening tag is not at the current position `undefined` is returned and the position
 * is left unchanged.
 * @param {ParseContext} CONTEXT
 * @param {RecipeRule} rule - Its `itemprop` is expected in the opening tag unless `isNested`
 * @param {StringValue | IntegerValue | NumberValue | BooleanValue | StringifiableValue |
 *     ReferenceToObjValue | ReferenceToIdValue | ReferenceToClobValue | ReferenceToBlobValue |
 *     MapValue | BagValue | ArrayValue | SetValue | ObjectValue} itemType
 * @param {boolean} isNested - `true` if the list tag has no `itemprop` attribute
 * @returns {Promise<unknown[] | undefined>}
 */
async function extractOrderedListTypeFromMicrodata(
    CONTEXT: ParseContext,
    rule: RecipeRule,
    itemType: BagValue['item'] | SetValue['item'],
    isNested: boolean = false
): Promise<unknown[] | undefined> {
    const startStr = isNested ? '<ol>' : `<ol itemprop="${rule.itemprop}">`;
    const endStr = '</ol>';
    const startStrLen = startStr.length;

    // Empty list. This has to be tested at the current position: without the position argument
    // startsWith() looks at the beginning of the string, which misses an empty list anywhere
    // else and wrongly matches one at index 0 while the parser is at a later position.
    if (CONTEXT.html.startsWith(startStr + endStr, CONTEXT.position)) {
        CONTEXT.position += startStrLen + endStr.length;
        return [];
    }

    if (!CONTEXT.html.startsWith(startStr, CONTEXT.position)) {
        return undefined;
    }

    const closingTagPosition = findClosingTag(CONTEXT.html, CONTEXT.position + startStrLen);

    const extractedValues = breakMicrodataIntoArray(
        CONTEXT.html,
        '<li>',
        '</li>',
        CONTEXT.position + startStrLen,
        closingTagPosition
    );

    CONTEXT.position = closingTagPosition + endStr.length;

    // Each item is parsed in its own isolated context (a copy with the item as the whole HTML
    // string), so parsing the items does not touch the position of the given CONTEXT.
    // NOTE(review): items are parsed with "isNested" left at its default (false) - confirm that
    // this matches what the microdata writer emits for list items.
    return await Promise.all(
        extractedValues.map(li =>
            parseMicrodataByTheExpectedType(itemType, rule, {
                ...CONTEXT,
                position: 0,
                html: li
            })
        )
    );
}

/**
 * Extracts a map (`<dl>`) from the microdata at the current parse position.
 *
 * The content of the list is split into entries which alternate between key and value; each
 * pair is parsed by `valueType.key` and `valueType.value` respectively. When a map is found
 * `CONTEXT.position` is advanced past the closing `</dl>` tag. When the expected opening tag
 * is not at the current position `undefined` is returned and the position is left unchanged.
 * @param {ParseContext} CONTEXT
 * @param {RecipeRule} rule - Its `itemprop` is expected in the opening tag unless `isNested`
 * @param {MapValue} valueType
 * @param {boolean} isNested - `true` if the list tag has no `itemprop` attribute
 * @returns {Promise<Map<unknown, unknown> | undefined>}
 */
async function extractMapTypeFromMicrodata(
    CONTEXT: ParseContext,
    rule: RecipeRule,
    valueType: MapValue,
    isNested: boolean = false
): Promise<Map<unknown, unknown> | undefined> {
    const startStr = isNested ? '<dl>' : `<dl itemprop="${rule.itemprop}">`;
    const endStr = '</dl>';
    const startStrLen = startStr.length;

    // Empty map
    if (CONTEXT.html.startsWith(startStr + endStr, CONTEXT.position)) {
        CONTEXT.position += startStrLen + endStr.length;
        return new Map();
    }

    if (!CONTEXT.html.startsWith(startStr, CONTEXT.position)) {
        return undefined;
    }

    const endTagPosition = findClosingTag(CONTEXT.html, CONTEXT.position + startStrLen);

    const unWrappedMicrodata = CONTEXT.html.slice(CONTEXT.position + startStrLen, endTagPosition);

    // Only the lengths of the entry tags are used to cut out the entries.
    // NOTE(review): value entries presumably use <dd>...</dd>, which have the same lengths as
    // <dt>...</dt> - confirm against the microdata writer.
    const entryOpenLen = '<dt>'.length;
    const entryCloseLen = '</dt>'.length;

    const extractedValues: string[] = [];
    let newStartPos = 0;

    while (newStartPos < unWrappedMicrodata.length) {
        const newEndPosition = findClosingTag(unWrappedMicrodata, newStartPos + entryOpenLen);
        extractedValues.push(unWrappedMicrodata.slice(newStartPos + entryOpenLen, newEndPosition));
        newStartPos = newEndPosition + entryCloseLen;
    }

    CONTEXT.position = endTagPosition + endStr.length;

    const valueMap = new Map<unknown, unknown>();

    // policy: map's microdata always has one value as the key and one value as the map's value
    for (let i = 0; i < extractedValues.length; i += 2) {
        // Key and value are each parsed in an isolated context (a copy with the entry as the
        // whole HTML string), one after the other
        const key = await parseMicrodataByTheExpectedType(valueType.key, rule, {
            ...CONTEXT,
            position: 0,
            html: extractedValues[i]
        });

        const value = await parseMicrodataByTheExpectedType(valueType.value, rule, {
            ...CONTEXT,
            position: 0,
            html: extractedValues[i + 1]
        });

        valueMap.set(key, value);
    }

    return valueMap;
}

/**
 * Extracts an unordered list (`<ul>`) from the microdata at the current parse position and
 * parses each `<li>` item by the given item type.
 *
 * When a list is found `CONTEXT.position` is advanced past the closing `</ul>` tag. When the
 * expected opening tag is not at the current position `undefined` is returned and the position
 * is left unchanged.
 * @param {ParseContext} CONTEXT
 * @param {RecipeRule} rule - Its `itemprop` is expected in the opening tag unless `isNested`
 * @param {StringValue | IntegerValue | NumberValue | BooleanValue | StringifiableValue |
 *     ReferenceToObjValue | ReferenceToIdValue | ReferenceToClobValue | ReferenceToBlobValue |
 *     MapValue | BagValue | ArrayValue | SetValue | ObjectValue} itemType
 * @param {boolean} isNested - `true` if the list tag has no `itemprop` attribute
 * @returns {Promise<unknown[] | undefined>}
 */
async function extractUnorderedListTypeFromMicrodata(
    CONTEXT: ParseContext,
    rule: RecipeRule,
    itemType: BagValue['item'] | SetValue['item'],
    isNested: boolean = false
): Promise<unknown[] | undefined> {
    const startStr = isNested ? '<ul>' : `<ul itemprop="${rule.itemprop}">`;
    const endStr = '</ul>';
    const startStrLen = startStr.length;

    // Empty list. This has to be tested at the current position: without the position argument
    // startsWith() looks at the beginning of the string, which misses an empty list anywhere
    // else and wrongly matches one at index 0 while the parser is at a later position.
    if (CONTEXT.html.startsWith(startStr + endStr, CONTEXT.position)) {
        CONTEXT.position += startStrLen + endStr.length;
        return [];
    }

    if (!CONTEXT.html.startsWith(startStr, CONTEXT.position)) {
        return undefined;
    }

    const endTagPosition = findClosingTag(CONTEXT.html, CONTEXT.position + startStrLen);

    const extractedValues = breakMicrodataIntoArray(
        CONTEXT.html,
        '<li>',
        '</li>',
        CONTEXT.position + startStrLen,
        endTagPosition
    );

    CONTEXT.position = endTagPosition + endStr.length;

    // Each item is parsed in its own isolated context (a copy with the item as the whole HTML
    // string), so parsing the items does not touch the position of the given CONTEXT.
    // NOTE(review): items are parsed with "isNested" left at its default (false) - confirm that
    // this matches what the microdata writer emits for list items.
    return await Promise.all(
        extractedValues.map(li =>
            parseMicrodataByTheExpectedType(itemType, rule, {
                ...CONTEXT,
                position: 0,
                html: li
            })
        )
    );
}

/**
 * Extracts a nested (plain, non-ONE) object from the microdata at the current parse position.
 *
 * The object is expected as a `<ul>` list, either wrapped in
 * `<span itemprop="[rule.itemprop]">...</span>` or as a bare list. Each chunk inside the list
 * is matched by its `itemprop` attribute against `valueType.rules` and parsed with the first
 * matching rule; chunks without a matching rule are ignored.
 *
 * When an object is found `CONTEXT.position` is advanced past its closing tag(s). When the
 * expected opening tag is not at the current position `undefined` is returned and the position
 * is left unchanged.
 * @param {ParseContext} CONTEXT
 * @param {RecipeRule} rule
 * @param {ObjectValue} valueType
 * @returns {Promise<unknown>}
 */
async function extracObjectTypeFromMicrodata(
    CONTEXT: ParseContext,
    rule: RecipeRule,
    valueType: ObjectValue
): Promise<unknown> {
    const itempropSpan = `<span itemprop="${rule.itemprop}">`;

    // TODO Didn't I ask not to use ul for nested objects?
    const emptyObjectStr = `${itempropSpan}<ul></ul></span>`;

    if (CONTEXT.html.startsWith(emptyObjectStr, CONTEXT.position)) {
        // The empty object has to be consumed like any other value, otherwise the caller
        // continues parsing at the very same tag
        CONTEXT.position += emptyObjectStr.length;
        return {};
    }

    let startStr;
    let endStr;

    if (CONTEXT.html.startsWith(itempropSpan, CONTEXT.position)) {
        // if its standalone object, it has itemprop
        startStr = `${itempropSpan}<ul>`;
        endStr = '</ul></span>';
    } else {
        startStr = '<ul>';
        endStr = '</ul>';
    }

    if (!CONTEXT.html.startsWith(startStr, CONTEXT.position)) {
        return undefined;
    }

    // The content starts right after the opening tag(s) *at the current position*, which is
    // not necessarily the beginning of the string
    const contentStart = CONTEXT.position + startStr.length;
    const endTagPosition = findClosingTag(CONTEXT.html, contentStart);

    const unWrappedMicrodata = CONTEXT.html.slice(contentStart, endTagPosition);

    const extractedValues: string[] = [];
    let newStartPos = 0;

    while (newStartPos < unWrappedMicrodata.length) {
        const chunk = extractMicrodataWithTag(unWrappedMicrodata, newStartPos);
        extractedValues.push(unWrappedMicrodata.slice(chunk.start, chunk.end));
        newStartPos = chunk.end;
    }

    // Parse all chunks, each in its own isolated context. The parse function is asynchronous:
    // its result has to be awaited, otherwise a Promise ends up as property value.
    const parsedProperties = await Promise.all(
        extractedValues.map(async (microdata): Promise<undefined | [string, unknown]> => {
            const itempropPos = microdata.indexOf('itemprop');

            for (const objectRule of valueType.rules) {
                if (microdata.startsWith(`itemprop="${objectRule.itemprop}">`, itempropPos)) {
                    const value = await parseMicrodataByTheExpectedType(
                        objectRule.itemtype,
                        objectRule,
                        // isolated CONTEXT
                        {
                            ...CONTEXT,
                            position: 0,
                            html: microdata
                        }
                    );

                    return [objectRule.itemprop, value];
                }
            }

            return undefined;
        })
    );

    // Properties are assigned in the order of the chunks in the microdata, independent of the
    // order in which the asynchronous parse tasks finished
    const obj: {[key: string]: unknown} = {};

    for (const property of parsedProperties) {
        if (property !== undefined) {
            obj[property[0]] = property[1];
        }
    }

    CONTEXT.position = endTagPosition + endStr.length;
    return obj;
}

/**
 * Length of the opening tag of included CLOB and BLOB (base64 encoded) files, not counting the
 * itemprop name. 64 is the hash string length; the itemprop string length, here left empty,
 * also needs to be added to get the full length.
 * @private
 * @type {number}
 */
const BLOB_START_LENGTH = '<span itemprop="" data-hash="">'.length + 64;

/**
 * Parses the base64 encoded string in an imploded ReferenceToBlob <span> tag and saves the BLOB.
 *
 * The tag is expected at the current parse position. Its content (everything up to the next
 * '<') is written to a base64 file write stream, and the position is advanced past the closing
 * span tag.
 * @private
 * @param {ParseContext} CONTEXT
 * @param {RecipeRule} rule - Its `itemprop` identifies the expected span tag
 * @returns {Promise<*>} The `hash` property of the stream's write result, or `undefined` if
 * the expected tag is not at the current position
 */
async function parseInnerBlob(CONTEXT: ParseContext, rule: RecipeRule): Promise<unknown> {
    const tagPrefix = '<span itemprop="' + rule.itemprop + '" data-hash=';

    if (!CONTEXT.html.startsWith(tagPrefix, CONTEXT.position)) {
        return undefined;
    }

    // Content boundaries: after the complete opening tag, up to the next tag
    const contentStart = CONTEXT.position + BLOB_START_LENGTH + rule.itemprop.length;
    const contentEnd = CONTEXT.html.indexOf('<', contentStart);

    const stream = createFileWriteStream('base64');

    stream.write(CONTEXT.html.slice(contentStart, contentEnd));
    await stream.end();

    const writeResult = await stream.promise;

    CONTEXT.position = contentEnd + '</span>'.length;

    return writeResult.hash;
}

/**
 * Parses the HTML-escaped string in an imploded ReferenceToClob <span> tag and saves the
 * UTF-8 CLOB.
 *
 * The tag is expected at the current parse position. Its content (everything up to the next
 * '<') is unescaped and stored, and the position is advanced past the closing span tag.
 * @private
 * @param {ParseContext} CONTEXT
 * @param {RecipeRule} rule - Its `itemprop` identifies the expected span tag
 * @returns {Promise<*>} The `hash` property of the store result, or `undefined` if the expected
 * tag is not at the current position
 */
async function parseInnerClob(CONTEXT: ParseContext, rule: RecipeRule): Promise<unknown> {
    const tagPrefix = '<span itemprop="' + rule.itemprop + '" data-hash=';

    if (!CONTEXT.html.startsWith(tagPrefix, CONTEXT.position)) {
        return undefined;
    }

    // Content boundaries: after the complete opening tag, up to the next tag
    const contentStart = CONTEXT.position + BLOB_START_LENGTH + rule.itemprop.length;
    const contentEnd = CONTEXT.html.indexOf('<', contentStart);

    const escapedText = CONTEXT.html.slice(contentStart, contentEnd);
    const writeResult = await storeUTF8Clob(unescapeFromHtml(escapedText));

    CONTEXT.position = contentEnd + '</span>'.length;

    return writeResult.hash;
}

/**
 * Parses the value at the current parse position according to the value type the recipe rule
 * expects, by dispatching to the matching extraction or parse function.
 *
 * - Primitive types are extracted as string and converted through `CONVERSION_FUNCTIONS`
 * - References (to objects, ID objects, CLOBs, BLOBs) are parsed, stored, and replaced by
 *   the hash the respective parse function returns
 * - Collections, maps and nested objects are extracted by their respective function
 * @async
 * @param {ValueType} valueType - Defaults to a plain string type
 * @param {RecipeRule} rule
 * @param {ParseContext} CONTEXT
 * @param {boolean} isNested - Passed through to the extraction functions that accept it
 * @returns {Promise<unknown>} Resolves with `undefined` for a value type that has no case
 * @throws {Error} M2O-PV3 if a string value does not match the regular expression of its type
 */
async function parseMicrodataByTheExpectedType(
    valueType: ValueType = {type: 'string'},
    rule: RecipeRule,
    CONTEXT: ParseContext,
    isNested: boolean = false
): Promise<unknown> {
    switch (valueType.type) {
        case 'string': {
            const raw = EXTRACT_FUNCTIONS.primitive(CONTEXT, rule, isNested);

            // This _must_ be a string value or somebody managed to sneak an impossible recipe
            // by our Recipe type check. In type-check's ensureRecipeRule() we ensure that a
            // RegExp can only be set for string properties, but we still check if it is a
            // string because the static type checker does not understand that larger context.
            // Plus, extra safety.
            if (raw !== undefined && valueType.regexp) {
                const pattern = new RegExp(
                    CONVERSION_FUNCTIONS.regexp(valueType.regexp as unknown as string)
                );

                if (!pattern.test(raw as string)) {
                    throw createError('M2O-PV3', {
                        value: raw,
                        regexp: CONVERSION_FUNCTIONS.regexp(valueType.regexp as unknown as string)
                    });
                }
            }

            // Nothing found, or an empty string: returned as is, without conversion
            if (!raw) {
                return raw;
            }

            return CONVERSION_FUNCTIONS[valueType.type](raw as string);
        }

        case 'integer':
        case 'number':
        case 'boolean':
        case 'stringifiable': {
            const raw = EXTRACT_FUNCTIONS.primitive(CONTEXT, rule, isNested);

            // Nothing found, or an empty string: returned as is, without conversion
            if (!raw) {
                return raw;
            }

            return CONVERSION_FUNCTIONS[valueType.type](raw as string);
        }

        case 'referenceToObj':
            return await parseInnerObject(CONTEXT, valueType.allowedTypes, rule, HashLinkType.OBJ);

        case 'referenceToId':
            return await parseInnerObject(CONTEXT, valueType.allowedTypes, rule, HashLinkType.ID);

        case 'referenceToClob':
            return await parseInnerClob(CONTEXT, rule);

        case 'referenceToBlob':
            return await parseInnerBlob(CONTEXT, rule);

        case 'array':
            return await EXTRACT_FUNCTIONS.orderedCollection(
                CONTEXT,
                rule,
                valueType.item,
                isNested
            );

        case 'map':
            return await EXTRACT_FUNCTIONS.map(CONTEXT, rule, valueType, isNested);

        case 'bag':
            return await EXTRACT_FUNCTIONS.unorderedCollection(
                CONTEXT,
                rule,
                valueType.item,
                isNested
            );

        case 'set': {
            // Note that a list that was not found (undefined) results in an empty Set here
            const items = await EXTRACT_FUNCTIONS.unorderedCollection(
                CONTEXT,
                rule,
                valueType.item,
                isNested
            );

            return new Set(items);
        }

        case 'object':
            return await EXTRACT_FUNCTIONS.obj(CONTEXT, rule, valueType);
    }
}

/**
 * Parses only the inner data part of a ONE microdata object: The opening tag of the outer
 * frame has been parsed already, and the HTML at the current parse position only contains
 * tags with actual data.
 *
 * The rules are processed one by one in the given order, each one (after resolving its
 * inheritance) parsing the value at the then current position. Values that are not found are
 * left out of the result if their rule is optional.
 * @private
 * @param {OneObjectTypeNames} type - The type was already found by the caller. It is inserted
 * into the returned object before any data properties so that it gets the first spot in the
 * iteration order of Javascript objects, which for non-numerical properties is insertion order.
 * This is for human readers of raw data output, the code does not care.
 * @param {ParseContext} CONTEXT
 * @param {RecipeRule[]} rules - An array of rules corresponding to all rules for a given ONE
 * object type from ONE object recipes
 * @returns {Promise<OneObjectTypes>}
 * @throws {Error} MEX-PD1 if no value was found for a rule that is not optional
 */
async function parseData(
    type: OneObjectTypeNames,
    CONTEXT: ParseContext,
    rules: readonly RecipeRule[]
): Promise<OneObjectTypes> {
    // The result starts out with only the "type" property, data properties are added below -
    // this is why it is loosely typed until it is returned
    const obj: Record<string, any> = {$type$: type};

    for (const rule of rules) {
        const resolvedRule = resolveRuleInheritance(rule);

        const value = await parseMicrodataByTheExpectedType(
            resolvedRule.itemtype,
            resolvedRule,
            CONTEXT
        );

        if (value !== undefined) {
            obj[resolvedRule.itemprop] = value;
            continue;
        }

        // No value: only acceptable for optional properties
        if (resolvedRule.optional !== true) {
            throw createError('MEX-PD1', {itemprop: resolvedRule.itemprop});
        }
    }

    return obj as OneObjectTypes;
}

/**
 * Reads the type name of a ONE object from its opening tag. The current parse position must be
 * at the first character of the type name; the name ends at the `">` that closes the tag. The
 * position is advanced past that `">`.
 * @private
 * @param {ParseContext} CONTEXT
 * @param {undefined|Set<OneObjectTypeNames|"*">} [expectedType] - Expect certain type strings, or
 * '*' if we should accept any ONE object type that we have a recipe for. For included
 * sub-objects this is set to the Set object of the recipe rule, for top-level objects this is
 * set by the caller.
 * @returns {OneObjectTypeNames}
 * @throws {Error} MEX-PHD1 if the type that was found is not among the expected types
 */
function parseHeaderType(
    CONTEXT: ParseContext,
    expectedType: undefined | Set<OneObjectTypeNames | '*'>
): OneObjectTypeNames {
    // The type string: from the current position to "> completing the tag
    const typeEnd = CONTEXT.html.indexOf('">', CONTEXT.position);
    const type = ensureValidTypeName(CONTEXT.html.slice(CONTEXT.position, typeEnd));

    // Type check:
    // 1) For included sub-objects always using the Set from rule.referenceToObj
    // 2) For top-level objects only when the caller of the module-export convert function
    //    included a type string
    const isAcceptedType = !expectedType || expectedType.has(type) || expectedType.has('*');

    if (!isAcceptedType) {
        throw createError('MEX-PHD1', {
            expected: Array.from(expectedType),
            type,
            html: CONTEXT.html
        });
    }

    CONTEXT.position += type.length + 2; // ">

    return type;
}

// The type is the last word in the URL value of attribute "itemtype":
// <div itemscope itemtype="//refin.io/[TYPE]">...</div>
const OUTER_HEADER_TYPE_START = '<div itemscope itemtype="//refin.io/';
// The same attributes without the leading '<div ' of the tag
const INNER_HEADER_TYPE_START = 'itemscope itemtype="//refin.io/';
// Attribute prefixes of the hashes on the outer <span> tag of an included object. The parse
// functions expect each one to be followed by 64 hash characters and a closing '"'. The ID
// hash attribute is optional, its prefix includes the separating space.
const DATA_HASH_START = 'data-hash="';
const DATA_ID_HASH_START = ' data-id-hash="';

/**
 * Parses the mandatory `data-hash="..."` attribute at the current parse position and advances
 * the position past the closing quote of the attribute value.
 * @private
 * @param {ParseContext} CONTEXT
 * @returns {SHA256Hash}
 * @throws {Error} MEX-PHA1 if the attribute with a 64 character value is not at the current
 * position
 */
function parseHashAttribute(CONTEXT: ParseContext): SHA256Hash {
    const {html, position} = CONTEXT;

    const hashStart = position + DATA_HASH_START.length;
    const hashEnd = hashStart + 64;

    // The attribute name has to be right here, and the value has to end after 64 characters
    const attributeFound = html.startsWith(DATA_HASH_START, position) && html.charAt(hashEnd) === '"';

    if (!attributeFound) {
        throw createError('MEX-PHA1', {position: CONTEXT.position, html: CONTEXT.html});
    }

    CONTEXT.position = hashEnd + 1; // '"'

    return ensureHash(html.slice(hashStart, hashEnd));
}

/**
 * Parses the optional ` data-id-hash="..."` attribute (including its leading space) at the
 * current parse position. If it is found the position is advanced past the closing quote of
 * the attribute value, otherwise the position remains unchanged.
 * @private
 * @param {ParseContext} CONTEXT
 * @returns {(undefined | SHA256IdHash)} `undefined` if the attribute with a 64 character value
 * is not at the current position
 */
function parseIdHashAttribute(CONTEXT: ParseContext): undefined | SHA256IdHash {
    const {html, position} = CONTEXT;

    const hashStart = position + DATA_ID_HASH_START.length;
    const hashEnd = hashStart + 64;

    // The attribute name has to be right here, and the value has to end after 64 characters
    const attributeFound =
        html.startsWith(DATA_ID_HASH_START, position) && html.charAt(hashEnd) === '"';

    if (!attributeFound) {
        return undefined;
    }

    CONTEXT.position = hashEnd + 1; // '"'

    return ensureIdHash(html.slice(hashStart, hashEnd));
}

/**
 * Parses the header of an included object: the opening `<span>` tag of the property the object
 * is assigned to, with its hash attributes, followed by the opening `<div>` tag of the object
 * itself with the type. The parse position is advanced past the opening div tag.
 * @private
 * @param {ParseContext} CONTEXT
 * @param {Set<OneObjectTypeNames|"*">} expectedType - Expect certain type strings, or '*' if
 * we should accept any ONE object type that we have a recipe for. For included sub-objects this
 * is set to the Set object of the recipe rule, for top-level objects this is set by the caller.
 * @param {string} itemprop - This property is set for included objects assigned to a property
 * of a higher-level object.
 * @returns {(undefined | TypeAndHashAndIdHash)} `[type, hash, idHash]`, or `undefined` if the
 * span tag for the given itemprop is not at the current position
 * @throws {Error} MEX-PIH1 if no itemprop is given, MEX-PIH2 if the span tag is not closed and
 * followed by the opening div tag of the included object. Errors of the hash attribute and
 * type parse functions are passed through.
 */
function parseInnerHeader(
    CONTEXT: ParseContext,
    expectedType: undefined | Set<OneObjectTypeNames | '*'>,
    itemprop: string
): undefined | TypeAndHashAndIdHash {
    if (itemprop === undefined) {
        throw createError('MEX-PIH1');
    }

    // Expected structure:
    // <span itemprop="..." data-hash="..."[ data-id-hash="..."]><div itemscope
    //   itemtype="//refin.io/[TYPE]">...</div></span>
    const startStr = `<span itemprop="${itemprop}" `;

    if (!CONTEXT.html.startsWith(startStr, CONTEXT.position)) {
        // ...but if we expect an included object (when "itemprop" exists in the outer
        // span tag of the object) not finding an object is okay, since all properties are
        // optional (we don't check the mandatory ID-properties because A., we assume such an
        // object can't be written, and B., we return what we find and leave judgment to the
        // caller; be strict when writing data but be lenient when reading).
        return undefined;
    }

    CONTEXT.position += startStr.length;

    // Parse <span> tag attributes data-hash, data-id-hash only found in included objects. Those
    // are the values from the original Reference or ReferenceToId object, respectively.
    // However, even if there originally was a ReferenceToId object, which only has an ID hash
    // but no hash, there will be a concrete hash for the (one) actual object inserted for the
    // ID reference.
    const hash = parseHashAttribute(CONTEXT);
    const idHash = parseIdHashAttribute(CONTEXT);

    // After the attributes the span tag has to be closed by '>' and the opening div tag of the
    // included object has to follow immediately. The '>' is verified instead of being skipped
    // blindly so that malformed microdata is not accepted.
    if (
        CONTEXT.html.charAt(CONTEXT.position) !== '>' ||
        !CONTEXT.html.startsWith(OUTER_HEADER_TYPE_START, CONTEXT.position + 1)
    ) {
        throw createError('MEX-PIH2', {
            expected: INNER_HEADER_TYPE_START,
            idHashEnd: CONTEXT.position,
            html: CONTEXT.html
        });
    }

    CONTEXT.position += OUTER_HEADER_TYPE_START.length + 1; // + '>'

    const type = parseHeaderType(CONTEXT, expectedType);

    return [type, hash, idHash];
}

/**
 * This function only parses the opening enclosing "`itemscope`" div tag of a top-level object,
 * extracts the type string and optionally compares it to a given expected type, and advances
 * the position counter to the next character after the tag it was responsible for parsing.
 *
 * ### Example
 *
 * ```html
 * <div itemscope itemtype="//refin.io/Person">
 * ```
 *
 * leads to a return value of `'Person'`, with `CONTEXT.position` advanced past the `">` that
 * closes the tag.
 *
 * **Note** that while `convertMicrodataToObject` takes a single string this function expects a
 * `Set` object to make it more flexible to accommodate `parseObject`, which also uses a `Set`
 * that it in turn receives from the recipe used to parse a *sub-*object.
 * @private
 * @param {ParseContext} CONTEXT
 * @param {Set<OneObjectTypeNames|"*">} [expectedType] - Expect certain type strings, or '*' if
 * we should accept any ONE object type that we have a recipe for. For included sub-objects this
 * is set to the Set object of the recipe rule, for top-level objects this is set by the caller.
 * @returns {OneObjectTypeNames}
 * @throws {Error} MEX-PH1 if the HTML string does not begin with the expected opening tag
 */
function parseHeader(
    CONTEXT: ParseContext,
    expectedType: undefined | Set<OneObjectTypeNames | '*'>
): OneObjectTypeNames {
    // The tag is looked for at the very beginning of the string
    const hasOuterHeader = CONTEXT.html.startsWith(OUTER_HEADER_TYPE_START, 0);

    if (hasOuterHeader) {
        CONTEXT.position += OUTER_HEADER_TYPE_START.length;

        return parseHeaderType(CONTEXT, expectedType);
    }

    // This is the outermost tag, no excuses...
    throw createError('MEX-PH1', {html: CONTEXT.html});
}

/**
 * Parses an included ONE object inside an HTML microdata ONE object, stores it, and returns
 * the hash or ID hash that replaces it in the parent object.
 *
 * Included objects always belong to a property of their parent object. Since values on
 * properties can be optional, and also because while parsing an array of values we always
 * eventually and expectedly run out of more values (here: included ONE objects) to parse, this
 * function may return `undefined` as value when it does not find the expected object instead of
 * throwing an error.
 *
 * Expected HTML:
 * ```
 * <span itemprop="..."...><div itemscope itemtype="//refin.io/...>...</div></span>
 * ```
 * @private
 * @param {ParseContext} CONTEXT
 * @param {Set<OneObjectTypeNames|"*">} expectedType - Expect certain type strings, or '*' if
 * we should accept any ONE object type that we have a recipe for. For included sub-objects this
 * is set to the Set object of the recipe rule, for top-level objects this is set by the caller.
 * @param {RecipeRule} [rule] - Included objects have a rule associated with the `itemprop`
 * property they are assigned to. The rule **should have inheritance already resolved** (property
 * `inheritFrom`), which should be the case because this parameter is only used by the
 * calling function parseMicrodataByTheExpectedType() which passes its `rule` parameter through,
 * which has the same restriction/expectation.
 * @param {'obj' | 'id'} referenceTo - Whether to return the hash ('obj') or the ID hash ('id')
 * @returns {Promise<SHA256Hash | SHA256IdHash | undefined>}
 * @throws {Error} MEX-PIO1 / MEX-PIO4 if the closing div / span tag is missing, MEX-PIO2 if
 * the hashes of the stored object differ from the ones in the microdata, MEX-PIO3 if an ID
 * reference is requested but the microdata has no ID hash
 */
async function parseInnerObject(
    CONTEXT: ParseContext,
    expectedType: Set<OneObjectTypeNames | '*'>,
    rule: RecipeRule,
    referenceTo: Extract<'obj' | 'id', HashLinkTypeNames>
): Promise<SHA256Hash | SHA256IdHash | undefined> {
    const header = parseInnerHeader(CONTEXT, expectedType, rule.itemprop);

    // This happens when there is an array of included objects and the array ends, and a new
    // itemprop starts. Returning undefined and the original starting position tells the caller
    // to advance to the next item because all array items have been collected.
    if (header === undefined) {
        return undefined;
    }

    const [type, hash, idHash] = header;

    // Parse the data of the included object, starting immediately after its opening div tag.
    // "type" is sent so that it can be inserted right away to be at #1 position in the
    // "order" of properties (when iterating) defined by insertion order.
    const obj = await parseData(type, CONTEXT, getRecipe(type).rule);

    if (!CONTEXT.html.startsWith(DIV_END, CONTEXT.position)) {
        throw createError('MEX-PIO1', {
            endStr: DIV_END,
            dataEnd: CONTEXT.position,
            html: CONTEXT.html
        });
    }

    // Advance to just after the </div> closing tag of the imploded object
    CONTEXT.position += DIV_END.length;

    // All inner objects are stored and then replaced by a reference object pointing to their
    // representation in storage.
    const storeResult = await (isVersionedObject(obj)
        ? storeVersionedObject(obj)
        : storeUnversionedObject(obj));

    // Microdata attribute data-id-hash of the included object's outer span tag will be undefined
    // unless it was included through ReferenceToId.
    // The imploder could include the ID hash always, i.e. not just for ReferenceToId but also
    // for Reference if the referenced object is a versioned one, but that would be useless. If
    // the hash is correct there is no (non-theoretical) way that the ID hash is wrong. On the
    // other hand we always have a hash because a concrete object was sent.
    const hashDiffers = storeResult.hash !== hash;
    const idHashDiffers = idHash !== undefined && storeResult.idHash !== idHash;

    if (hashDiffers || idHashDiffers) {
        throw createError('MEX-PIO2', {
            hash,
            idHash,
            rHash: storeResult.hash,
            rIdHash: storeResult.idHash
        });
    }

    if (referenceTo === 'id' && idHash === undefined) {
        throw createError('MEX-PIO3', {type: obj.$type$});
    }

    // Advance to the end of the </span> closing tag of the reference object property where the
    // referenced object was included (and just parsed), where normally there is just a hash
    if (!CONTEXT.html.startsWith(SPAN_END, CONTEXT.position)) {
        throw createError('MEX-PIO4', {html: CONTEXT.html, position: CONTEXT.position});
    }

    CONTEXT.position += SPAN_END.length;

    return referenceTo === 'id' ? idHash : hash;
}

/**
 * Parses a complete top-level html object including the outer frame that contains the type,
 * and stores the resulting object.
 *
 * There are two parts: First the opening outer div tag is parsed for the type information, then
 * the html inside the outer frame is parsed for the data. The type string is used to find the
 * rule-set to use for parsing - the order of rules determines the order data properties are
 * expected. When the data is parsed exactly the closing div tag of the outer frame must be left
 * unparsed, with nothing after it.
 *
 * - parseHeader() determines the type from the opening tag
 * - parseData() is the next-level function, parsing all the actual data of the object and
 *   returning the object
 *
 * **Note** that while `convertMicrodataToObject` takes a single string this function expects a
 * `Set` object because it needs to be more flexible: It uses the recipe rule's `type` property
 * which is a `Set` object.
 * @private
 * @param {ParseContext} CONTEXT
 * @param {Set<OneObjectTypeNames|"*">} expectedType - Expect certain type strings, or '*' if
 * we should accept any ONE object type that we have a recipe for. For included sub-objects this
 * is set to the Set object of the recipe rule, for top-level objects this is set by the caller.
 * @returns {Promise<AnyObjectCreation>} The result of storing the object
 * @throws {Error} MEX-PO1 if the closing tag does not follow the data, MEX-PO2 if there are
 * characters left after the closing tag
 */
async function parseObject(
    CONTEXT: ParseContext,
    expectedType: Set<OneObjectTypeNames | '*'>
): Promise<AnyObjectCreation> {
    const type = parseHeader(CONTEXT, expectedType);

    // We get the data-object of the ONE object. Afterwards the parse position is immediately
    // after the last character of the last data item.
    const obj = await parseData(
        // "type" is sent so that it can be inserted right away to be at #1 position in the
        // "order" of properties (when iterating) defined by insertion order.
        type,
        CONTEXT,
        getRecipe(type).rule
    );

    // When the data parsing is done we should find the end-tag next and nothing else.
    const endStr = '</div>';

    if (!CONTEXT.html.startsWith(endStr, CONTEXT.position)) {
        throw createError('MEX-PO1', {endStr, dataEnd: CONTEXT.position, html: CONTEXT.html});
    }

    // The final step: If there is even a single character left that was not visited thus far
    // the string is not valid ONE microdata. This might be a newline added by an editor
    // automatically for whatever reason, but this would make the SHA-256 hash that the given
    // type and data are expected to have invalid.
    const leftoverChars = CONTEXT.html.length - (CONTEXT.position + endStr.length);

    if (leftoverChars > 0) {
        // "nr" is the number of characters after the closing tag. The parentheses above
        // matter: without them the length of the closing tag is added instead of subtracted.
        throw createError('MEX-PO2', {nr: leftoverChars});
    }

    return isVersionedObject(obj)
        ? await storeVersionedObject(obj)
        : await storeUnversionedObject(obj);
}

/**
 * Entry point of this module: takes the microdata string of one ONE object, parses it and
 * returns the result of storing it. Any problem found along the way surfaces as an exception.
 *
 * How the parsing is organized:
 *
 * - The microdata string is walked through exactly *once*, strictly forward. There is no
 *   look-ahead, e.g. searching for an end-tag first and then returning to parse what lies
 *   before it.
 * - The original string is never modified or cut into pieces. A single parse context object
 *   holds the string together with the current position, and it is handed to the parsing
 *   functions, which move the position forward as they consume input.
 * - The only new strings created are - by necessity - the actual values extracted from the
 *   microdata. Engines such as V8 may represent sub-strings internally as pointers into the
 *   parent string, but the code does not rely on that. For some background see:
 *     http://mrale.ph/blog/2016/11/23/making-less-dart-faster.html
 * - Any mismatch between expected and actual input makes the respective function throw right
 *   away, so code that keeps running needs no further checks.
 *
 * Here the context is created with position 0 and `isIdObj` set to `false`, the expected
 * type(s) are normalized into a `Set`, and both are passed to parseObject().
 * @static
 * @async
 * @param {string} html - One object in HTML (MicroData) representation
 * @param {(OneObjectTypeNames|'*'|OneObjectTypeNames[])} [expectedType='*'] - A single expected
 * type name, an array of expected type names, or '*' (the default) for no restriction. The
 * value is only wrapped into a `Set` here; the actual check happens during parsing.
 * @returns {Promise<AnyObjectCreation>} Whatever parseObject() returns for the given
 * microdata, i.e. the result of storing the exploded object
 * @throws {Error} `MEX-PEXPL1` if `html` is `undefined`; otherwise any error thrown while
 * parsing or storing
 */
export async function explode(
    html: string,
    expectedType: OneObjectTypeNames | '*' | OneObjectTypeNames[] = '*'
): Promise<AnyObjectCreation> {
    // Guard for callers that are not type-checked and omit the argument
    if (html === undefined) {
        throw createError('MEX-PEXPL1');
    }

    // A single name becomes a one-element list so that both cases end up in a Set
    const typeNames = Array.isArray(expectedType) ? expectedType : [expectedType];

    // Fresh parser state: start at the first character of the given string
    const parseContext: ParseContext = {html, position: 0, isIdObj: false};

    const creationResult = await parseObject(parseContext, new Set(typeNames));

    return creationResult;
}