Source: microdata-to-id-hash.ts

  1. /**
  2. * @author Michael Hasenstein <hasenstein@yahoo.com>
  3. * @copyright REFINIO GmbH 2018
  4. * @license CC-BY-NC-SA-2.5; portions MIT License
  5. * @version 0.0.1
  6. */
  7. /**
  8. * A ONE internal module to optimize the calculation of ID hashes from microdata without going
  9. * through a to-Javascript-object conversion.
  10. * @module
  11. */
  12. import type {OneVersionedObjectInterfaces} from '@OneObjectInterfaces';
  13. import {createError} from './errors';
  14. import type {ParseContext} from './microdata-to-object';
  15. import {findClosingTag, parseHeader} from './microdata-to-object';
  16. import {getRecipe, isVersionedObjectType, resolveRuleInheritance} from './object-recipes';
  17. import {getHashLinkTypeFromRule} from './object-to-microdata';
  18. import type {OneObjectTypeNames, OneVersionedObjectTypeNames} from './recipes';
  19. import {createCryptoHash} from './system/crypto-helpers';
  20. import {readUTF8TextFile} from './system/storage-base';
  21. import {ID_OBJECT_ATTR} from './util/object';
  22. import type {SHA256Hash, SHA256IdHash} from './util/type-checks';
  23. const ITEMPROP_START = '<span itemprop="';
  24. const ITEMPROP_END = '</span>';
  25. const REF_ITEMPROP_START = '<a itemprop="';
  26. const REF_ITEMPROP_END = '</a>';
  27. const ORDERED_ARRAY_ITEMPROP_START = '<ol itemprop="';
  28. const ORDERED_ARRAY_ITEMPROP_END = '</ol>';
  29. const UNORDERED_ARRAY_ITEMPROP_START = '<ul itemprop="';
  30. const UNORDERED_ARRAY_ITEMPROP_END = '</ul>';
  31. const MAP_ITEMPROP_START = '<dl itemprop="';
  32. const MAP_ITEMPROP_END = '</dl>';
  33. /**
  34. * This function takes a ONE object in microdata format, parses the type, and if it is a versioned
  35. * object builds the ID object microdata by extracting the ID properties into a new string and
  36. * adding the ID object attribute to the outer span.
  37. *
  38. * Assumption (enforced in type-checks.js function `ensureRecipeRuleObj`): all ID properties are
  39. * on the top level, never inside nested objects. Reminder: nested objects are part of the
  40. * object, unlike included objects, which have their own recipe, and which even when included in
  41. * imploded form don't influence the ID object of the parent, since they cannot be ID properties
  42. * (disallowed through the mentioned type-checks.js function).
  43. * @static
  44. * @param {string} microdata - The full microdata string
  45. * @param {(OneObjectTypeNames|OneObjectTypeNames[])} [expectedType] - An optional expected
  46. * type or an array of expected type names. If it is not matched by the microdata leads to an
  47. * `Error` when attempting to parse the microdata. Leaving this parameter undefined or
  48. * setting it to '*' disables the type check.
  49. * @returns {(undefined|string)} Returns the ONE object microdata string of the ID object or
  50. * `undefined` if the object is not a versioned object and no ID object can be built
  51. */
  52. export function extractIdObject<T extends OneObjectTypeNames>(
  53. microdata: string,
  54. expectedType: T | '*' | T[] = '*'
  55. ): void | string {
  56. const parseContext: ParseContext = {
  57. html: microdata,
  58. isIdObj: false,
  59. position: 0
  60. };
  61. // This throws an error if there is no valid ONE microdata header (the rest is ignored)
  62. const type = parseHeader(
  63. new Set(Array.isArray(expectedType) ? expectedType : [expectedType]),
  64. parseContext
  65. );
  66. if (!isVersionedObjectType(type)) {
  67. return;
  68. }
  69. const recipe = getRecipe(type);
  70. // Outer ONE object <span> tag with the type information. Make it ID-object microdata by
  71. // inserting the virtual (never found in storage) attribute used to make hashes of ID
  72. // objects different from hashes of regular objects with (coincidentally) only ID properties
  73. let idMicrodata = `<div ${ID_OBJECT_ATTR} ${microdata.substring(5, parseContext.position)}`;
  74. let firstTagStart = parseContext.position;
  75. for (const rule of recipe.rule) {
  76. const actualRule = resolveRuleInheritance(rule);
  77. const usesLinkTag = actualRule.itemtype
  78. ? getHashLinkTypeFromRule(actualRule.itemtype.type) !== undefined
  79. : false;
  80. // Not "actualRule" but "rule" - inherited isId is ignored, ID properties must be
  81. // defined in the actual recipe and cannot be inherited.
  82. if (rule.isId) {
  83. // The match string we can use must end at the itemprop name boundary and nothing
  84. // more. That is because Reference and ReferenceToId objects can be ID properties,
  85. // and they have a full outer span tag including "itemscope" and "itemtype", which
  86. // itemprop <span> tags that are just regular values do not have.
  87. // The search string therefore is: '<span itemprop="..."'
  88. let itempropStart = '';
  89. let itempropEnd = '';
  90. const valueType = rule.itemtype ? rule.itemtype : {type: 'string', string: {}};
  91. if (valueType.type === 'bag' || valueType.type === 'set') {
  92. itempropStart = UNORDERED_ARRAY_ITEMPROP_START;
  93. itempropEnd = UNORDERED_ARRAY_ITEMPROP_END;
  94. } else if (valueType.type === 'array') {
  95. itempropStart = ORDERED_ARRAY_ITEMPROP_START;
  96. itempropEnd = ORDERED_ARRAY_ITEMPROP_END;
  97. } else if (valueType.type === 'map') {
  98. itempropStart = MAP_ITEMPROP_START;
  99. itempropEnd = MAP_ITEMPROP_END;
  100. } else if (usesLinkTag) {
  101. itempropStart = REF_ITEMPROP_START;
  102. itempropEnd = REF_ITEMPROP_END;
  103. } else {
  104. itempropStart = ITEMPROP_START;
  105. itempropEnd = ITEMPROP_END;
  106. }
  107. const matchStr = itempropStart + actualRule.itemprop + '"';
  108. // Two cases: The current itemprop holds...
  109. // - a single value
  110. // - an array of values
  111. // In case 1 the tag we find as a start and its corresponding end tag are all we
  112. // need. If we have an array there possibly are many (but at least one) such tag. If
  113. // we look for the last one the area between the first and the last are all values
  114. // for this same itemprop, due to how ONE microdata is created.
  115. // ID properties can be included objects like "Reference" ONE objects, which means
  116. // there are <span> tags nested inside the opening and closing <span> tags. Our
  117. // algorithm does not need special treatment to find either one since findEndTag()
  118. // counts opening and closing tags (presuming correct ONE object microdata).
  119. const foundPos = microdata.indexOf(matchStr, firstTagStart);
  120. if (foundPos === -1) {
  121. if (rule.optional === true) {
  122. continue;
  123. }
  124. // ID properties are "must have" and never optional.
  125. throw createError('M2IH-XID1', {itemprop: actualRule.itemprop, microdata});
  126. }
  127. firstTagStart = foundPos;
  128. // About 2nd parameter:
  129. // We have to advance the starting position past the "<" of the current <span ...> tag
  130. // because that is what findClosingTag() searches for. How far we go (without going past
  131. // the end of the current tag!) is a matter of "nano-optimization", +1 is the minimum.
  132. // We skip the match string which still is safe.
  133. const endTagPosition = findClosingTag(microdata, firstTagStart + matchStr.length);
  134. // <span itemprop="idPropertyA"...>.....</span> (one or many concatenated)
  135. // substring()'s second parameter is the index of the first character to EXCLUDE from
  136. // the returned substring.
  137. idMicrodata += microdata.substring(firstTagStart, endTagPosition + itempropEnd.length);
  138. }
  139. }
  140. return idMicrodata + '</div>';
  141. }
  142. /**
  143. * The function calculates the ID hash for a given hash. If the object is not a versioned
  144. * object `undefined` is returned.
  145. *
  146. * This method performs no validity checks on the microdata. It presumes that the microdata
  147. * represents a valid ONE object.
  148. *
  149. * However, if the header itself cannot be parsed an Error is thrown. An error is also thrown if
  150. * for a detected opening <span> tag no matching closing tag can be found, and an `Error` is
  151. * thrown if there is an `expectedType` parameter and the type found in the header is not a match.
  152. * @static
  153. * @async
  154. * @param {SHA256Hash} hash
  155. * @param {(OneVersionedObjectTypeNames|OneVersionedObjectTypeNames[]|'*')} [expectedType='*'] - An
  156. * optional expected type or an array of expected type names. If it is not matched by the
  157. * microdata leads to an `Error` when attempting to parse the microdata. Leaving this parameter
  158. * undefined or setting it to '*' disables the type check.
  159. * @returns {Promise<undefined|SHA256Hash>} Returns undefined if the hash points to an
  160. * unversioned object, or the SHA-256 of the ID object of the object identified by the given
  161. * hash if it is a versioned object.
  162. */
  163. export async function calculateIdHashForStoredObj<T extends OneVersionedObjectTypeNames>(
  164. hash: SHA256Hash<OneVersionedObjectInterfaces[T]>,
  165. expectedType: T | '*' | T[] = '*'
  166. ): Promise<undefined | SHA256IdHash<OneVersionedObjectInterfaces[T]>> {
  167. const microdata = await readUTF8TextFile(hash);
  168. if (microdata.startsWith(`<div ${ID_OBJECT_ATTR}`)) {
  169. return hash as unknown as SHA256IdHash<OneVersionedObjectInterfaces[T]>;
  170. }
  171. const idObjectMicrodata = extractIdObject(microdata, expectedType);
  172. if (idObjectMicrodata === undefined) {
  173. // Unversioned object type
  174. return;
  175. }
  176. return (await createCryptoHash(idObjectMicrodata)) as unknown as SHA256IdHash<
  177. OneVersionedObjectInterfaces[T]
  178. >;
  179. }