Source: microdata-to-id-hash.ts

/**
 * @author Michael Hasenstein <hasenstein@yahoo.com>
 * @copyright REFINIO GmbH 2018
 * @license CC-BY-NC-SA-2.5; portions MIT License
 * @version 0.0.1
 */

/**
 * A ONE internal module to optimize the calculation of ID hashes from microdata without going
 * through a to-JavaScript-object conversion.
 * @module
 */

import type {OneVersionedObjectInterfaces} from '@OneObjectInterfaces';

import {createError} from './errors';
import type {ParseContext} from './microdata-to-object';
import {findClosingTag, parseHeader} from './microdata-to-object';
import {getRecipe, isVersionedObjectType, resolveRuleInheritance} from './object-recipes';
import {getHashLinkTypeFromRule} from './object-to-microdata';
import type {OneObjectTypeNames, OneVersionedObjectTypeNames} from './recipes';
import {createCryptoHash} from './system/crypto-helpers';
import {readUTF8TextFile} from './system/storage-base';
import {ID_OBJECT_ATTR} from './util/object';
import type {SHA256Hash, SHA256IdHash} from './util/type-checks';

// Opening/closing tag fragments used by extractIdObject() to locate ID properties in ONE
// microdata. Each *_START constant ends right after the opening quote of the itemprop attribute;
// the caller appends the itemprop name and the closing quote to build the search string.

// Fallback pair: used when none of the more specific cases below applies.
const ITEMPROP_START = '<span itemprop="';
const ITEMPROP_END = '</span>';

// Used when the value type maps to a hash link type (see getHashLinkTypeFromRule()).
const REF_ITEMPROP_START = '<a itemprop="';
const REF_ITEMPROP_END = '</a>';

// Used for value type 'array'.
const ORDERED_ARRAY_ITEMPROP_START = '<ol itemprop="';
const ORDERED_ARRAY_ITEMPROP_END = '</ol>';

// Used for value types 'bag' and 'set'.
const UNORDERED_ARRAY_ITEMPROP_START = '<ul itemprop="';
const UNORDERED_ARRAY_ITEMPROP_END = '</ul>';

// Used for value type 'map'.
const MAP_ITEMPROP_START = '<dl itemprop="';
const MAP_ITEMPROP_END = '</dl>';

/**
 * This function takes a ONE object in microdata format, parses the type, and if it is a versioned
 * object builds the ID object microdata by extracting the ID properties into a new string and
 * adding the ID object attribute to the outer tag.
 *
 * The ID properties are searched for in the order in which the recipe lists its rules, and each
 * search starts at the position where the previous ID property was found. The microdata is
 * therefore expected to contain the properties in recipe order.
 *
 * Assumption (enforced in type-checks.js function `ensureRecipeRuleObj`): all ID properties are
 * on the top level, never inside nested objects. Reminder: nested objects are part of the
 * object, unlike included objects, which have their own recipe, and which even when included in
 * imploded form don't influence the ID object of the parent, since they cannot be ID properties
 * (disallowed through the mentioned type-checks.js function).
 * @static
 * @param {string} microdata - The full microdata string
 * @param {(OneObjectTypeNames|OneObjectTypeNames[])} [expectedType] - An optional expected
 * type or an array of expected type names. If it is not matched by the microdata leads to an
 * `Error` when attempting to parse the microdata. Leaving this parameter undefined or
 * setting it to '*' disables the type check.
 * @returns {(undefined|string)} Returns the ONE object microdata string of the ID object or
 * `undefined` if the object is not a versioned object and no ID object can be built
 * @throws {Error} M2IH-XID1 if a non-optional ID property cannot be found in the microdata.
 * Errors thrown by the header parser or by the closing-tag search are passed through.
 */
export function extractIdObject<T extends OneObjectTypeNames>(
    microdata: string,
    expectedType: T | '*' | T[] = '*'
): void | string {
    const parseContext: ParseContext = {
        html: microdata,
        isIdObj: false,
        position: 0
    };

    // This throws an error if there is no valid ONE microdata header (the rest is ignored)
    const type = parseHeader(
        new Set(Array.isArray(expectedType) ? expectedType : [expectedType]),
        parseContext
    );

    if (!isVersionedObjectType(type)) {
        return;
    }

    const recipe = getRecipe(type);

    // Outer ONE object tag with the type information. Make it ID-object microdata by inserting
    // the ID object attribute, which makes hashes of ID objects different from hashes of
    // regular objects with (coincidentally) only ID properties. The first five characters of
    // the original header (assumed to be '<div ') are skipped because they are re-created here.
    let idMicrodata = `<div ${ID_OBJECT_ATTR} ${microdata.substring(5, parseContext.position)}`;

    // Search position; it only ever moves forward, to the start of the last ID property found.
    let firstTagStart = parseContext.position;

    for (const rule of recipe.rule) {
        const actualRule = resolveRuleInheritance(rule);

        // Not "actualRule" but "rule" - inherited isId is ignored, ID properties must be
        // defined in the actual recipe and cannot be inherited.
        if (!rule.isId) {
            continue;
        }

        // The value type decides which tag wraps the property. It is taken from the RESOLVED
        // rule, exactly like the itemprop name and the link-tag check below, so that an ID rule
        // which inherits its itemtype is matched with the tag of the inherited type instead of
        // silently falling back to the plain-value <span>. A rule without any itemtype is
        // treated as a plain string value.
        const resolvedItemtype = actualRule.itemtype;
        const valueTypeName: string = resolvedItemtype ? resolvedItemtype.type : 'string';
        const usesLinkTag = resolvedItemtype
            ? getHashLinkTypeFromRule(resolvedItemtype.type) !== undefined
            : false;

        let itempropStart = '';
        let itempropEnd = '';

        if (valueTypeName === 'bag' || valueTypeName === 'set') {
            itempropStart = UNORDERED_ARRAY_ITEMPROP_START;
            itempropEnd = UNORDERED_ARRAY_ITEMPROP_END;
        } else if (valueTypeName === 'array') {
            itempropStart = ORDERED_ARRAY_ITEMPROP_START;
            itempropEnd = ORDERED_ARRAY_ITEMPROP_END;
        } else if (valueTypeName === 'map') {
            itempropStart = MAP_ITEMPROP_START;
            itempropEnd = MAP_ITEMPROP_END;
        } else if (usesLinkTag) {
            itempropStart = REF_ITEMPROP_START;
            itempropEnd = REF_ITEMPROP_END;
        } else {
            itempropStart = ITEMPROP_START;
            itempropEnd = ITEMPROP_END;
        }

        // The match string must end at the itemprop name boundary (the closing quote) and
        // nothing more, because the opening tag may carry further attributes after the
        // itemprop attribute. Example search string: '<span itemprop="..."'
        const matchStr = itempropStart + actualRule.itemprop + '"';

        const foundPos = microdata.indexOf(matchStr, firstTagStart);

        if (foundPos === -1) {
            // A missing ID property is only tolerated if the rule explicitly marks it optional
            if (rule.optional === true) {
                continue;
            }

            throw createError('M2IH-XID1', {itemprop: actualRule.itemprop, microdata});
        }

        firstTagStart = foundPos;

        // About 2nd parameter:
        // We have to advance the starting position past the "<" of the current opening tag
        // because that is what findClosingTag() searches for. How far we go (without going past
        // the end of the current tag!) is a matter of "nano-optimization", +1 is the minimum.
        // We skip the match string which still is safe.
        // Tags nested inside the property need no special treatment here; finding the matching
        // closing tag is left entirely to findClosingTag() (presuming correct ONE microdata).
        const endTagPosition = findClosingTag(microdata, firstTagStart + matchStr.length);

        // Copy the whole property, from its opening tag up to and including its closing tag.
        // substring()'s second parameter is the index of the first character to EXCLUDE from
        // the returned substring.
        idMicrodata += microdata.substring(firstTagStart, endTagPosition + itempropEnd.length);
    }

    return idMicrodata + '</div>';
}

/**
 * Calculates the ID hash of the object stored under the given hash. If the object is not a
 * versioned object `undefined` is returned.
 *
 * If the stored microdata already starts with the ID object header (`<div` followed by the ID
 * object attribute) the given hash is returned unchanged; in that case `expectedType` is not
 * checked.
 *
 * This method performs no validity checks on the microdata. It presumes that the microdata
 * represents a valid ONE object.
 *
 * However, if the header itself cannot be parsed an Error is thrown. An error is also thrown if
 * for a detected opening tag no matching closing tag can be found, and an `Error` is thrown if
 * there is an `expectedType` parameter and the type found in the header is not a match.
 * @static
 * @async
 * @param {SHA256Hash} hash - Hash of the stored object whose microdata is read from storage
 * @param {(OneVersionedObjectTypeNames|OneVersionedObjectTypeNames[]|'*')} [expectedType='*'] - An
 * optional expected type or an array of expected type names. If it is not matched by the
 * microdata leads to an `Error` when attempting to parse the microdata. Leaving this parameter
 * undefined or setting it to '*' disables the type check.
 * @returns {Promise<undefined|SHA256IdHash>} Resolves with undefined if the hash points to an
 * unversioned object, or with the SHA-256 of the ID object of the object identified by the
 * given hash if it is a versioned object.
 */
export async function calculateIdHashForStoredObj<T extends OneVersionedObjectTypeNames>(
    hash: SHA256Hash<OneVersionedObjectInterfaces[T]>,
    expectedType: T | '*' | T[] = '*'
): Promise<undefined | SHA256IdHash<OneVersionedObjectInterfaces[T]>> {
    const storedMicrodata = await readUTF8TextFile(hash);
    const idObjectHeaderStart = `<div ${ID_OBJECT_ATTR}`;

    // The stored data already is ID object microdata: its hash is the ID hash.
    if (storedMicrodata.startsWith(idObjectHeaderStart)) {
        return hash as unknown as SHA256IdHash<OneVersionedObjectInterfaces[T]>;
    }

    const idMicrodata = extractIdObject(storedMicrodata, expectedType);

    if (idMicrodata !== undefined) {
        const idHash = await createCryptoHash(idMicrodata);

        return idHash as unknown as SHA256IdHash<OneVersionedObjectInterfaces[T]>;
    }

    // Unversioned object type: there is no ID object and hence no ID hash
    return undefined;
}