Refactored the line converter, decoupled its logic from greentexting, and
improved handling of broken cases
This commit is contained in:
parent
e825021ef1
commit
bebafa1a2c
3 changed files with 67 additions and 15 deletions
|
@ -246,6 +246,7 @@ const getLinkData = (attrs, children, index) => {
|
||||||
*/
|
*/
|
||||||
export const preProcessPerLine = (html, greentext, handleLinks) => {
|
export const preProcessPerLine = (html, greentext, handleLinks) => {
|
||||||
const lastMentions = []
|
const lastMentions = []
|
||||||
|
const greentextHandle = new Set(['p', 'div'])
|
||||||
|
|
||||||
let nonEmptyIndex = -1
|
let nonEmptyIndex = -1
|
||||||
const newHtml = convertHtmlToLines(html).reverse().map((item, index, array) => {
|
const newHtml = convertHtmlToLines(html).reverse().map((item, index, array) => {
|
||||||
|
@ -256,7 +257,14 @@ export const preProcessPerLine = (html, greentext, handleLinks) => {
|
||||||
nonEmptyIndex += 1
|
nonEmptyIndex += 1
|
||||||
|
|
||||||
// Greentext stuff
|
// Greentext stuff
|
||||||
if (greentext && (string.includes('>') || string.includes('<'))) {
|
if (
|
||||||
|
// Only if greentext is engaged
|
||||||
|
greentext &&
|
||||||
|
// Only handle p's and divs. Don't want to affect blockquotes, code etc.
|
||||||
|
item.level.every(l => greentextHandle.has(l)) &&
|
||||||
|
// Only if line contains '>' or '<'
|
||||||
|
(string.includes('>') || string.includes('<'))
|
||||||
|
) {
|
||||||
const cleanedString = string.replace(/<[^>]+?>/gi, '') // remove all tags
|
const cleanedString = string.replace(/<[^>]+?>/gi, '') // remove all tags
|
||||||
.replace(/@\w+/gi, '') // remove mentions (even failed ones)
|
.replace(/@\w+/gi, '') // remove mentions (even failed ones)
|
||||||
.trim()
|
.trim()
|
||||||
|
|
|
@ -19,9 +19,42 @@ import { getTagName } from './utility.service.js'
|
||||||
* @return {(string|{ text: string })[]} processed html in form of a list.
|
* @return {(string|{ text: string })[]} processed html in form of a list.
|
||||||
*/
|
*/
|
||||||
export const convertHtmlToLines = (html) => {
|
export const convertHtmlToLines = (html) => {
|
||||||
const ignoredTags = new Set(['code', 'blockquote'])
|
// Elements that are implicitly self-closing
|
||||||
const handledTags = new Set(['p', 'br', 'div', 'pre', 'code', 'blockquote'])
|
// https://developer.mozilla.org/en-US/docs/Glossary/empty_element
|
||||||
const openCloseTags = new Set(['p', 'div', 'pre', 'code', 'blockquote'])
|
const emptyElements = new Set([
|
||||||
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
|
||||||
|
'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
|
||||||
|
])
|
||||||
|
// Block-level elements (each one makes a visual line)
|
||||||
|
// https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
|
||||||
|
const blockElements = new Set([
|
||||||
|
'address', 'article', 'aside', 'blockquote', 'details', 'dialog', 'dd',
|
||||||
|
'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
|
||||||
|
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'main',
|
||||||
|
'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul'
|
||||||
|
])
|
||||||
|
// br is unusual in that it's technically not block-level; it's
|
||||||
|
// essentially converted to a \n (or \r\n). There's also wbr but it doesn't
|
||||||
|
// guarantee a linebreak, it only suggests one.
|
||||||
|
const linebreakElements = new Set(['br'])
|
||||||
|
|
||||||
|
const visualLineElements = new Set([
|
||||||
|
...blockElements.values(),
|
||||||
|
...linebreakElements.values()
|
||||||
|
])
|
||||||
|
|
||||||
|
// All visual-line elements that aren't empty elements, i.e. not <hr> or <br>
|
||||||
|
const nonEmptyElements = new Set(visualLineElements)
|
||||||
|
// Difference
|
||||||
|
for (let elem of emptyElements) {
|
||||||
|
nonEmptyElements.delete(elem)
|
||||||
|
}
|
||||||
|
|
||||||
|
// All elements that we are recognizing
|
||||||
|
const allElements = new Set([
|
||||||
|
...nonEmptyElements.values(),
|
||||||
|
...emptyElements.values()
|
||||||
|
])
|
||||||
|
|
||||||
let buffer = [] // Current output buffer
|
let buffer = [] // Current output buffer
|
||||||
const level = [] // How deep we are in tags and which tags were there
|
const level = [] // How deep we are in tags and which tags were there
|
||||||
|
@ -29,8 +62,8 @@ export const convertHtmlToLines = (html) => {
|
||||||
let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag
|
let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag
|
||||||
|
|
||||||
const flush = () => { // Processes current line buffer, adds it to output buffer and clears line buffer
|
const flush = () => { // Processes current line buffer, adds it to output buffer and clears line buffer
|
||||||
if (textBuffer.trim().length > 0 && !level.some(l => ignoredTags.has(l))) {
|
if (textBuffer.trim().length > 0) {
|
||||||
buffer.push({ text: textBuffer })
|
buffer.push({ level: [...level], text: textBuffer })
|
||||||
} else {
|
} else {
|
||||||
buffer.push(textBuffer)
|
buffer.push(textBuffer)
|
||||||
}
|
}
|
||||||
|
@ -49,10 +82,12 @@ export const convertHtmlToLines = (html) => {
|
||||||
}
|
}
|
||||||
|
|
||||||
const handleClose = (tag) => { // handles closing tags
|
const handleClose = (tag) => { // handles closing tags
|
||||||
flush()
|
|
||||||
buffer.push(tag)
|
|
||||||
if (level[0] === getTagName(tag)) {
|
if (level[0] === getTagName(tag)) {
|
||||||
|
flush()
|
||||||
|
buffer.push(tag)
|
||||||
level.shift()
|
level.shift()
|
||||||
|
} else { // Broken case
|
||||||
|
textBuffer += tag
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -67,10 +102,10 @@ export const convertHtmlToLines = (html) => {
|
||||||
const tagFull = tagBuffer
|
const tagFull = tagBuffer
|
||||||
tagBuffer = null
|
tagBuffer = null
|
||||||
const tagName = getTagName(tagFull)
|
const tagName = getTagName(tagFull)
|
||||||
if (handledTags.has(tagName)) {
|
if (allElements.has(tagName)) {
|
||||||
if (tagName === 'br') {
|
if (linebreakElements.has(tagName)) {
|
||||||
handleBr(tagFull)
|
handleBr(tagFull)
|
||||||
} else if (openCloseTags.has(tagName)) {
|
} else if (nonEmptyElements.has(tagName)) {
|
||||||
if (tagFull[1] === '/') {
|
if (tagFull[1] === '/') {
|
||||||
handleClose(tagFull)
|
handleClose(tagFull)
|
||||||
} else if (tagFull[tagFull.length - 2] === '/') {
|
} else if (tagFull[tagFull.length - 2] === '/') {
|
||||||
|
|
|
@ -1,8 +1,17 @@
|
||||||
import { convertHtmlToLines } from 'src/services/html_converter/html_line_converter.service.js'
|
import { convertHtmlToLines } from 'src/services/html_converter/html_line_converter.service.js'
|
||||||
|
|
||||||
const mapOnlyText = (processor) => (input) => input.text ? processor(input.text) : input
|
const greentextHandle = new Set(['p', 'div'])
|
||||||
|
const mapOnlyText = (processor) => (input) => {
|
||||||
|
if (input.text && input.level.every(l => greentextHandle.has(l))) {
|
||||||
|
return processor(input.text)
|
||||||
|
} else if (input.text) {
|
||||||
|
return input.text
|
||||||
|
} else {
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
describe('html_line_converter', () => {
|
describe.only('html_line_converter', () => {
|
||||||
describe('with processor that keeps original line should not make any changes to HTML when', () => {
|
describe('with processor that keeps original line should not make any changes to HTML when', () => {
|
||||||
const processorKeep = (line) => line
|
const processorKeep = (line) => line
|
||||||
it('fed with regular HTML with newlines', () => {
|
it('fed with regular HTML with newlines', () => {
|
||||||
|
@ -81,7 +90,7 @@ describe('html_line_converter', () => {
|
||||||
|
|
||||||
it('fed with very broken HTML with broken composition', () => {
|
it('fed with very broken HTML with broken composition', () => {
|
||||||
const input = '</p> lmao what </div> whats going on <div> wha <p>'
|
const input = '</p> lmao what </div> whats going on <div> wha <p>'
|
||||||
const output = '</p>_</div>_<div>_<p>'
|
const output = '_<div>_<p>'
|
||||||
const result = convertHtmlToLines(input)
|
const result = convertHtmlToLines(input)
|
||||||
const comparableResult = result.map(mapOnlyText(processorReplace)).join('')
|
const comparableResult = result.map(mapOnlyText(processorReplace)).join('')
|
||||||
expect(comparableResult).to.eql(output)
|
expect(comparableResult).to.eql(output)
|
||||||
|
@ -111,7 +120,7 @@ describe('html_line_converter', () => {
|
||||||
expect(comparableResult).to.eql(output)
|
expect(comparableResult).to.eql(output)
|
||||||
})
|
})
|
||||||
|
|
||||||
it('fed with maybe valid HTML? self-closing divs and ps', () => {
|
it('fed with maybe valid HTML? (XHTML) self-closing divs and ps', () => {
|
||||||
const input = 'a <div class="what"/> what now <p aria-label="wtf"/> ?'
|
const input = 'a <div class="what"/> what now <p aria-label="wtf"/> ?'
|
||||||
const output = '_<div class="what"/>_<p aria-label="wtf"/>_'
|
const output = '_<div class="what"/>_<p aria-label="wtf"/>_'
|
||||||
const result = convertHtmlToLines(input)
|
const result = convertHtmlToLines(input)
|
||||||
|
|
Loading…
Reference in a new issue