Source: microdata-to-json.ts

/**
 * @author Michael Hasenstein <hasenstein@yahoo.com>
 * @copyright REFINIO GmbH 2017
 * @license CC-BY-NC-SA-2.5; portions MIT License
 * @version 0.0.1
 */

/**
 * This module provides the same functionality as module microdata-to-object, but the result is
 * not a Javascript object but a JSON encoded string representation. The purpose of this module
 * is to serve as a shortcut: instead of converting a ONE object in microdata form to a
 * Javascript object and then JSON-stringifying it, this provides a more efficient path. Since
 * no objects have to be created and both input and output are strings, this probably also
 * conserves memory.
 * @module
 */

/* ***********************************************************************************************
 *
 * THIS MODULE IS ALMOST EXACTLY THE SAME AS microdata-to-object.js
 *
 * Main difference: The final product is a string and not an object
 *
 *
 * ********************************************************************************************* */

import {createError} from './errors';
import type {ParseContext} from './microdata-to-object';
import {parseData, parseHeader} from './microdata-to-object';
import {ensureValidTypeName, getRecipe} from './object-recipes';
import {stringify} from './util/sorted-stringify';
import {isString} from './util/type-checks-basic';

/**
 * Parses a complete HTML (microdata) object including the outer frame that contains the type,
 * and returns it as a JSON string. There are two parts: First the opening outer div tag is
 * parsed for the type information, then the HTML inside the outer frame is parsed for the data.
 * The type string is used to find the rule-set to use for parsing - the order of rules
 * determines the order data properties are expected. When the data has been parsed there should
 * be exactly the closing div tag of the outer frame left unparsed (of the current object - if
 * this is an inner/included object); the position is then moved past that closing tag.
 *
 * - parseObject() is the top-level function to parse microdata, determining the type and
 *   producing the JSON string that is going to be returned.
 * - parseData() is the next-level function, parsing all the actual data of an object. Its
 *   return value is what gets serialized with stringify().
 *
 * Side effect: CONTEXT.position is advanced. It is left untouched when the start of the object
 * is not found.
 *
 * @private
 * @param {ParseContext} CONTEXT - The microdata string and the current parse position in it
 * @param {string} [itemprop] - This property is set for included objects assigned to a property
 * of a higher-level object.
 * @returns {(undefined|string)} The JSON string of the parsed object, or `undefined` if an
 * included object (`itemprop` given) was expected but no object starts at the current position
 * @throws {Error} Error 'M2J-PO1' if a top-level object (no `itemprop`) does not start at the
 * current position; errors thrown by the helper functions called here are passed through
 */
function parseObject(CONTEXT: ParseContext, itemprop?: string): undefined | string {
    const closingTag = '</div>';

    // The type is the last word in the URL value of attribute "itemtype":
    // <div itemscope itemtype="//refin.io/[TYPE]">...</div>
    const startStr = `<div${
        // The outermost frame enclosing the ONE object does not have an "itemprop" attribute,
        // but an included object does have this attribute in its outer frame.
        itemprop === undefined ? '' : ` itemprop="${itemprop}"`
    } itemscope itemtype="//refin.io/`; // +'">'

    // Example: <div itemscope itemtype="//refin.io/OneTest$Email">...inner html...</div>
    if (!CONTEXT.html.startsWith(startStr, CONTEXT.position)) {
        // Only at the very beginning of an object do we make this distinction: If we expect an
        // included object (i.e. "itemprop" is a string) not finding an object is okay. However,
        // once we found the start of the expected object any further error truly is an exception.
        if (itemprop === undefined) {
            throw createError('M2J-PO1');
        }

        return undefined;
    }

    // Find the type string: between the end of startStr and the "> completing the tag.
    // NOTE(review): if "> is missing indexOf() yields -1 and the slice covers the rest of the
    // string minus its last character - presumably rejected by ensureValidTypeName(); confirm.
    const typeStart = CONTEXT.position + startStr.length;
    const expectedType = ensureValidTypeName(
        CONTEXT.html.slice(typeStart, CONTEXT.html.indexOf('">', typeStart))
    );
    const type = parseHeader(
        new Set(Array.isArray(expectedType) ? expectedType : [expectedType]),
        CONTEXT
    );

    // We get the data-object of the ONE object, and CONTEXT.position is moved to immediately
    // after the last character of the last data item.
    const obj = parseData(
        // "type" is sent so that it can be inserted right away to be at #1 position in the
        // "order" of properties (when iterating) defined by insertion order.
        type,
        getRecipe(type).rule,
        CONTEXT
    );

    // New position: Immediately after the closing </div> of the (sub?-)object we just finished
    // parsing. This must be relative to the current position: searching the whole string for
    // the last "</div>" would skip everything up to the end of the outermost object (wrong for
    // included objects, and it would hide unparsed content in front of the final closing tag).
    // NOTE(review): the closing tag itself is not verified here, the position is only advanced
    // by its length.
    CONTEXT.position += closingTag.length;

    return stringify(obj);
}

/**
 * Convert the microdata representation of a ONE object directly to a JSON string. This is the
 * same parser as in microdata-to-object.js, but the result is a string instead of an object.
 *
 * The whole input string must be consumed by the parser: any character left over after the
 * closing tag of the object (for example a trailing newline) makes the input invalid.
 *
 * @example
 *
 * const json = convertMicrodataToJSON(
 *   '<div itemscope itemtype="//refin.io/Person">' +
 *     '<span itemprop="email">winfried@mail.com</span>' +
 *   '</div>'
 * );
 * console.log(json); // The JSON string representation of the Person object
 * @see
 * {@link microdata-to-object.module:ts.convertMicrodataToObject|microdata-to-object.convertMicrodataToObject}
 * @static
 * @param {string} html - One object in HTML (MicroData) representation
 * @returns {string} Returns the JSON string version of the parsed microdata
 * @throws {Error} Error 'M2J-CONV1' if `html` is not a string; error 'M2J-CONV2' if no object
 * was parsed or if the parser did not end exactly at the end of the string; errors thrown while
 * parsing the object are passed through
 */
export function convertMicrodataToJSON(html: string): string {
    if (!isString(html)) {
        throw createError('M2J-CONV1');
    }

    const CONTEXT: ParseContext = {
        html,
        isIdObj: false,
        position: 0
    };

    const value = parseObject(CONTEXT);

    // The final step: If there is even a single character left that was not visited thus far
    // the string is not valid ONE microdata. This might be a newline added by an editor
    // automatically for whatever reason, but this would make the SHA-256 hash that the given
    // type and data are expected to have invalid.
    // "nr" is the number of characters between the final parse position and the end of input.
    if (value === undefined || CONTEXT.position !== html.length) {
        throw createError('M2J-CONV2', {nr: html.length - CONTEXT.position});
    }

    return value;
}