Extract text from HTML while preserving block-level element newlines

based on https://stackoverflow.com/a/20384452/3338098 and fixed to support TEXT1<div>TEXT2</div>=>TEXT1\nTEXT2 and allow non-DOM nodes

/**
 * Returns the style for a node.
 *
 * @param n The node to check.
 * @param p The property to retrieve (usually 'display').
 * @link http://www.quirksmode.org/dom/getstyles.html
 */
function getNodeStyle( n, p ) {
  return n.currentStyle ?
    n.currentStyle[p] :
    document.defaultView.getComputedStyle(n, null).getPropertyValue(p);
}

//IF THE NODE IS NOT ACTUALLY IN THE DOM then this won't take into account <div style="display: inline;">text</div>
//however for simple things like `contenteditable` this is sufficient, however for arbitrary html this will not work
function isNodeBlock(node) {
  if (node.nodeType == document.TEXT_NODE) {return false;}
  var d = getNodeStyle( node, 'display' );//this is irrelevant if the node isn't currently in the current DOM.
  if (d.match( /^block/ ) || d.match( /list/ ) || d.match( /row/ ) ||
      node.tagName == 'BR' || node.tagName == 'HR' ||
      node.tagName == 'DIV' // div,p,... add as needed to support non-DOM nodes
     ) {
    return true;
  }
  return false;
}

/**
 * Converts HTML to text, preserving semantic newlines for block-level
 * elements.
 *
 * @param node - The HTML node to perform text extraction.
 */
function htmlToText( htmlOrNode, isNode ) {
  var node = htmlOrNode;
  if (!isNode) {node = jQuery("<span>"+htmlOrNode+"</span>")[0];}
  //TODO: inject "unsafe" HTML into current DOM while guaranteeing that it won't
  //      change the visible DOM so that `isNodeBlock` will work reliably
  var result = '';
  if( node.nodeType == document.TEXT_NODE ) {
    // Replace repeated spaces, newlines, and tabs with a single space.
    result = node.nodeValue.replace( /\s+/g, ' ' );
  } else {
    for( var i = 0, j = node.childNodes.length; i < j; i++ ) {
      result += htmlToText( node.childNodes[i], true );
      if (i < j-1) {
        if (isNodeBlock(node.childNodes[i])) {
          result += '\n';
        } else if (isNodeBlock(node.childNodes[i+1]) &&
                   node.childNodes[i+1].tagName != 'BR' &&
                   node.childNodes[i+1].tagName != 'HR') {
          result += '\n';
        }
      }
    }
  }
  return result;
}

the main change was

      if (i < j-1) {
        if (isNodeBlock(node.childNodes[i])) {
          result += '\n';
        } else if (isNodeBlock(node.childNodes[i+1]) &&
                   node.childNodes[i+1].tagName != 'BR' &&
                   node.childNodes[i+1].tagName != 'HR') {
          result += '\n';
        }
      }

to check neighboring blocks to determine the appropriateness of adding a newline.


Consider:

/**
 * Returns the style for a node.
 *
 * @param n The node to check.
 * @param p The property to retrieve (usually 'display').
 * @link http://www.quirksmode.org/dom/getstyles.html
 */
this.getStyle = function( n, p ) {
  return n.currentStyle ?
    n.currentStyle[p] :
    document.defaultView.getComputedStyle(n, null).getPropertyValue(p);
}

/**
 * Converts HTML to text, preserving semantic newlines for block-level
 * elements.
 *
 * @param node - The HTML node to perform text extraction.
 */
this.toText = function( node ) {
  var result = '';

  if( node.nodeType == document.TEXT_NODE ) {
    // Replace repeated spaces, newlines, and tabs with a single space.
    result = node.nodeValue.replace( /\s+/g, ' ' );
  }
  else {
    for( var i = 0, j = node.childNodes.length; i < j; i++ ) {
      result += _this.toText( node.childNodes[i] );
    }

    var d = _this.getStyle( node, 'display' );

    if( d.match( /^block/ ) || d.match( /list/ ) || d.match( /row/ ) ||
        node.tagName == 'BR' || node.tagName == 'HR' ) {
      result += '\n';
    }
  }

  return result;
}

http://jsfiddle.net/3mzrV/2/

That is to say, with an exception or two, iterate through each node and print its contents, letting the browser's computed style tell you when to insert newlines.


This seems to be (nearly) doing what you want:

function getText($node) {
    return $node.contents().map(function () {
        if (this.nodeName === 'BR') {
            return '\n';
        } else if (this.nodeType === 3) {
            return this.nodeValue;
        } else {
            return getText($(this));
        }
    }).get().join('');
}

DEMO

It just recursively concatenates the values of all text nodes and replaces <br> elements with line breaks.

But there is no semantics in this, it completely relies the original HTML formatting (the leading and trailing white spaces seem to come from how jsFiddle embeds the HTML, but you can easily trim those). For example, notice how it indents the definition term.

If you really want to do this on a semantic level, you need a list of block level elements, recursively iterate over the elements and indent them accordingly. You treat different block elements differently with respect to indentation and line breaks around them. This should not be too difficult.