import {
  longestFirstSortFn,
  PUNCTUATION_CHARSET_STRING,
  regExpEscape,
  throw_up,
} from "./util.js"
import { dancers } from "./chooser.js"
import { moves } from "./define-figure.js"

/**
 * An ordered list of word nodes that can be flattened to a single string.
 * Nodes are whatever flattenWordNode (below) accepts: strings, the `comma`
 * sentinel, `false`, or any object with its own flatten/peek methods, such as
 * Tag or another Words.
 *
 * @param {Array} arr - the nodes, stored by reference (not copied)
 */
export function Words(arr) {
  this.arr = arr
}

/**
 * Variadic convenience constructor: words(a, b, c) is new Words([a, b, c]).
 *
 * @param {...*} nodes - the word nodes, in order
 * @returns {Words}
 */
export const words = function(...nodes) {
  return new Words(nodes)
}

// Plain Words are not scrunched: flatten may insert a space before each node.
Words.prototype.scrunched = function() {
  return false
}

// ____ ScrunchedWords are Words that don't insert spaces
// between arguments as aggressively as regular words
function ScrunchedWords(arr) {
  Words.call(this, arr)
}
ScrunchedWords.prototype = Object.create(Words.prototype)
ScrunchedWords.prototype.constructor = ScrunchedWords
ScrunchedWords.prototype.scrunched = function() {
  return true
}
// ____ ScrunchedWords end

// If a node's first visible character (see peek below) is one of these,
// flatten does not put a space in front of that node.
var wants_no_space_before = [false, null, ",", ".", ";"]

// Output formats understood by the flatten methods.
export const FLATTEN_FORMAT_MARKDOWN = 1001
export const FLATTEN_FORMAT_HTML = 1002
export const FLATTEN_FORMAT_UNSAFE_TEXT = 1003
export const FLATTEN_FORMAT_SAFE_TEXT = 1004

// returns *sanitized* html
Words.prototype.toHtml = function() {
  return this.flatten(FLATTEN_FORMAT_HTML)
}

// returns *unsanitized* string with Tags (see below) lobotomized into text.
// May do whitespace dilation a la html, but at least it preserves newlines.
Words.prototype.toUnsafeText = function() {
  return this.flatten(FLATTEN_FORMAT_UNSAFE_TEXT)
}

// returns *sanitized* text, e.g. pb&j → pb&amp;j
// but is different from toHtml because any Tags (see below) are
// flattened with no bracketing tags, they're all body.
Words.prototype.toSafeText = function() {
  return this.flatten(FLATTEN_FORMAT_SAFE_TEXT)
}

// returns *unsanitized* string with Tags (see below) rendered as html (not Markdown)
// This preserves newlines, unlike toHtml.
// It's assumed to be headed for a markdown parser that takes care of that.
Words.prototype.toMarkdown = function() {
  return this.flatten(FLATTEN_FORMAT_MARKDOWN)
}

/**
 * Flatten every node to a string in the given format and concatenate them.
 * Unless this is a scrunched Words, a single space is inserted before each
 * node whose first visible character is not listed in wants_no_space_before.
 * The result is trimmed of leading/trailing whitespace, except newlines.
 *
 * @param {number} format - one of the FLATTEN_FORMAT_* constants
 * @returns {string}
 */
Words.prototype.flatten = function(format) {
  const arr = this.arr
  const acc = []
  for (let i = 0; i < arr.length; i++) {
    const wants_space_before =
      !this.scrunched() && -1 === wants_no_space_before.indexOf(peek(arr[i]))
    if (wants_space_before) {
      acc.push(" ")
    }
    acc.push(flattenWordNode(arr[i], format))
  }
  return trimButLeaveNewlines(acc.join(""))
}

// Returns the first truthy peek() of any contained node, or null if none.
Words.prototype.peek = function() {
  for (let i = 0; i < this.arr.length; i++) {
    const p = peek(this.arr[i])
    if (p) {
      return p
    }
  }
  return null
}

/**
 * A word node that wraps a body in an html element when flattened to html
 * or markdown, and flattens to just its body in the text formats.
 *
 * @param {string} tag - element name, e.g. "u" or "s"
 * @param {Object} attrs - must be empty; attributes are not implemented
 * @param {*} body - any word node
 */
export const Tag = function(tag, attrs, body) {
  this.tag = tag
  this.attrs = attrs
  this.body = body
}

// Shorthand for a Tag with no attributes.
export const tag = function(tag, body) {
  return new Tag(tag, {}, body)
}

// function tag_attrs(tag, attrs, body) {
//   return new Tag(tag, attrs, body);
// }

/**
 * Flatten this tag. Text formats yield only the flattened body; markdown and
 * html yield the body wrapped in an opening and a closing element.
 * Calls throw_up if attrs is non-empty or the format is unknown.
 *
 * @param {number} format - one of the FLATTEN_FORMAT_* constants
 * @returns {string}
 */
Tag.prototype.flatten = function(format) {
  if (
    format === FLATTEN_FORMAT_UNSAFE_TEXT ||
    format === FLATTEN_FORMAT_SAFE_TEXT
  ) {
    return flattenWordNode(this.body, format)
  } else if (
    format === FLATTEN_FORMAT_MARKDOWN ||
    format === FLATTEN_FORMAT_HTML
  ) {
    if (Object.keys(this.attrs).length !== 0) {
      throw_up("attrs not yet implemented, but used")
    }
    const open = "<" + this.tag + ">"
    const close = "</" + this.tag + ">"
    return open + flattenWordNode(this.body, format) + close
  } else {
    // String() rather than .toString() so an undefined format still reaches throw_up
    throw_up("unexpected word flatten format :" + String(format))
  }
}

Tag.prototype.peek = function() {
  return peek(this.body)
}

// What each matched token becomes in the sanitized formats. "&amp;" maps to
// itself so an already-escaped ampersand is not escaped a second time.
const sanitizationMap = {
  "<": "&lt;",
  ">": "&gt;",
  "&": "&amp;",
  "&amp;": "&amp;",
}

/**
 * Flatten a single word node to a string.
 *  - objects with a flatten method delegate to it
 *  - strings are html-escaped for the HTML and SAFE_TEXT formats and passed
 *    through untouched for MARKDOWN and UNSAFE_TEXT
 *  - the comma sentinel becomes ","
 *  - false becomes ""
 *  - anything else is stringified
 * Note: null/undefined are not supported here (the property access throws).
 */
function flattenWordNode(s, format) {
  if (s.flatten) {
    return s.flatten(format)
  } else if ("string" === typeof s) {
    if (format === FLATTEN_FORMAT_HTML || format === FLATTEN_FORMAT_SAFE_TEXT) {
      // "&amp;" must precede "&" in the alternation so it wins at the same position
      return s.replace(/&amp;|&|<|>/g, function(match) {
        return (
          sanitizationMap[match] ||
          throw_up("Unexpected match during flatten sanitize")
        )
      })
    } else if (
      format === FLATTEN_FORMAT_MARKDOWN ||
      format === FLATTEN_FORMAT_UNSAFE_TEXT
    ) {
      return s
    } else {
      throw_up("unexpected flatten format: " + String(format))
    }
  } else if (comma === s) {
    return ","
  } else if (false === s) {
    return ""
  } else {
    return "" + s
  }
}

// returns the first character of thing that is non-whitespace or a newline,
// "," for the comma sentinel, and null when there is no such character
export const peek = function(thing) {
  if (thing.peek) {
    return thing.peek()
  }
  if (typeof thing === "string") {
    const m = thing.match(/[\S\n]/)
    return m ? m[0] : null
  }
  if (thing === comma) {
    return ","
  }
  // false and every other kind of node have no visible first character
  return null
}

/**
 * Like String.prototype.trim, except that newlines count as content: trimming
 * stops at the first/last character that is non-whitespace or a newline.
 *
 * @param {string} s
 * @returns {string}
 */
export const trimButLeaveNewlines = function(s) {
  const keeper = /[\S\n]/
  let start = 0
  while (start < s.length && !keeper.test(s[start])) {
    start++
  }
  let end = s.length - 1
  while (end >= start && !keeper.test(s[end])) {
    end--
  }
  return s.slice(start, end + 1)
}

////////////////////////////////////////////////////////////////

// Sentinel word node meaning "a comma here"; compared by identity.
export const comma = Object.freeze([Object.freeze("comma")])

////////////////////////////////////////////////////////////////

/**
 * Wrap recognized terms in <u> (term is valid in this dialect) or <s> (term
 * is not) tags. The string is assumed to already be in-dialect by the time
 * it gets here.
 *
 * @param {string} string - text to mark up
 * @param {Object} dialect - object with `dancers` and `moves` substitution maps
 * @returns {Words} a ScrunchedWords alternating plain strings and Tags
 */
export const lingoLineWords = function(string, dialect) {
  const underlines_and_strikes = underlinesAndStrikes(dialect)
  const all_lingo_lines = underlines_and_strikes.underlines
    .concat(underlines_and_strikes.strikes)
    // an empty term would let the regex match nothing and stall the loop below
    .filter(function(term) {
      return term.length > 0
    })
    .sort(longestFirstSortFn)
  if (all_lingo_lines.length === 0) {
    return new ScrunchedWords([string])
  }
  // lookbehind doesn't work in all versions of js, so we've got to use capture groups for word boundaries, sigh
  const boundary = "\\s|" + PUNCTUATION_CHARSET_STRING
  const regex = new RegExp(
    "(" +
      boundary +
      "|^)(" +
      all_lingo_lines.map(regExpEscape).join("|") +
      ")(" +
      boundary +
      "|$)",
    "ig"
  )
  const buffer = []
  let last_match_ended_at = 0
  let match_info
  while ((match_info = regex.exec(string))) {
    buffer.push(string.slice(last_match_ended_at, match_info.index))
    const is_strike =
      underlines_and_strikes.strikes.indexOf(match_info[2].toLowerCase()) >= 0
    buffer.push(match_info[1]) // its whitespace, but it might be a newline
    buffer.push(tag(is_strike ? "s" : "u", match_info[2]))
    // put back the trailing boundary so it can open the next match
    regex.lastIndex = regex.lastIndex - match_info[3].length
    last_match_ended_at = regex.lastIndex
  }
  buffer.push(string.slice(last_match_ended_at))
  return new ScrunchedWords(buffer)
}

// Terms that always get considered for striking, in addition to moves and dancers.
const bogusTerms = [
  "men",
  "women",
  "man",
  "woman",
  "gentlemen",
  "gentleman",
  "gents",
  "gent",
  "ladies",
  "lady",
  "leads",
  "lead",
  "follows",
  "follow",
  "larks",
  "lark",
  "ravens",
  "raven",
  "sex",
  "gypsy",
  "yearn",
  "rory o'moore",
  "rollaway",
  "roll-away",
  "nn",
  "n ",
  "p ",
  "l ",
  "g ",
  "m ",
  "w ",
  "n.",
  "p.",
  "l.",
  "g.",
  "m.",
  "w.",
  "g1",
  "g2",
  "l1",
  "l2",
]

// Lazily filled with moves().concat(dancers()) on first use.
let terms_for_uands

/**
 * Work out which terms to underline and which to strike for a dialect.
 * underlines: each known term's dialect substitution (with %S removed), or
 *   the term itself when there is no substitution, lower-cased.
 * strikes: every known or bogus term, lower-cased, that is not an underline.
 * Both lists are lower-case because lingoLineWords looks matches up by their
 * lower-cased text.
 *
 * NB on return value: it is freshly allocated each time
 */
function underlinesAndStrikes(dialect) {
  if (!terms_for_uands) {
    terms_for_uands = moves().concat(dancers())
  }
  const underlines = terms_for_uands.map(function(term) {
    const substitution = dialect.dancers[term] || dialect.moves[term]
    return (substitution ? stripPercentS(substitution) : term).toLowerCase()
  })
  const underlined = new Set(underlines)
  const strikes = terms_for_uands
    .concat(bogusTerms)
    .map(function(s) {
      return s.toLowerCase()
    })
    .filter(function(s) {
      return !underlined.has(s)
    })
  return { underlines: underlines, strikes: strikes }
}

// Remove every "%S" placeholder and trim the surrounding whitespace.
function stripPercentS(str) {
  return str.replace(/%S/g, "").trim()
}