Split a string into an array of strings of 1-3 words each, depending on word length

You can express your rules as abbreviated regular expressions, build a real regex from them and apply it to your input:

// Sample input: words separated by whitespace and/or punctuation.
const text = "Lorem ipsum, dolor. sit amet? consectetur,   adipiscing,  elit! sed doeiusmod tempor incididunt ut Duis aute irure dolor in reprehenderit in esse cillum dolor eu fugia bla?";

// Chunking rules in shorthand, tried left to right at each position:
//   S = a short word (1-5 word characters) plus the separators that follow it
//   L = a long word (6+ word characters) plus the separators that follow it
// (?=L) is a lookahead: the next word must be long, but it is not consumed.
// The final (.+) picks up whatever is left when no other rule applies
// (e.g. a lone short word at the very end of the input).
const rules = ['(SSS)', '(SS(?=L))', '(L(?=L))', '(SL)', '(LS)', '(.+)'];

// Expand the shorthand into a real regex. The 'g' flag makes String#match
// return every chunk instead of only the first one.
const regex = new RegExp(
    rules
        .join('|')
        .replace(/S/g, '\\w{1,5}\\W+')
        .replace(/L/g, '\\w{6,}\\W+'),
    'g'
);

// Each chunk keeps the separators (punctuation/whitespace) that trail its words.
console.log(text.match(regex));

If the rules don't change, the regex construction part is only needed once.

Note that this also handles punctuation in a reasonable way.


One option is to first create an array of rules, like:

// Rule table, meant to be checked from top to bottom. Each entry has the shape
//   [number of words to splice, condition for word 1, condition for word 2, ...]
// where 'less' means the word's length is < 6 and 'eqmore' means it is >= 6.
const rules = [
  [3, 'less', 'less', 'less'],   // next three words are < 6, < 6, < 6: splice 3
  [2, 'less', 'less', 'eqmore'], // next three words are < 6, < 6, >= 6: splice 2
  [1, 'eqmore', 'eqmore'],       // next two words are >= 6, >= 6: splice 1
  [2, 'eqmore', 'less'],         // next two words are >= 6, < 6: splice 2
  [2, 'less', 'eqmore'],         // next two words are < 6, >= 6: splice 2
];

Then, while words remain, find the first rule that matches the upcoming words, take the number of words to splice from that rule, and push the spliced words (joined by a space) to the output array:

    // Rule table, tried in order: [words to take, ...conditions on the upcoming words].
    const rules = [
      [3, 'less', 'less', 'less'],
      [2, 'less', 'less', 'eqmore'],
      [1, 'eqmore', 'eqmore'],
      [2, 'eqmore', 'less'],
      [2, 'less', 'eqmore'],
    ];
    const s = "Lorem ipsum dolor sit amet consectetur adipiscing elit sed doeiusmod tempor incididunt ut Duis aute irure dolor in reprehenderit in esse cillum dolor eu fugia";

    const words = s.split(' ');
    const output = [];

    // 'less' accepts words shorter than 6 characters; any other condition
    // accepts words of 6 characters or more.
    const verify = (cond, word) => {
      if (cond === 'less') {
        return word.length < 6;
      }
      return word.length >= 6;
    };

    // Consume the word list from the front until nothing is left.
    while (words.length > 0) {
      // First rule whose conditions all hold for the words at the head of the list.
      const matchedRule = rules.find((rule) => {
        const conditions = rule.slice(1);
        return conditions.every((condition, offset) => verify(condition, words[offset]));
      });
      // There is no fallback rule, so this line throws if nothing matched.
      const takeCount = matchedRule[0];
      const chunk = words.splice(0, takeCount);
      output.push(chunk.join(' '));
    }
    console.log(output);

Of course, the .find assumes that every input string will always have a matching rule for each position spliced. If no rule matches, .find returns undefined and destructuring its result throws a TypeError.

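For the additional rule that any words not matched by the previous rules should just be added to the output one at a time, append [1] to the end of the rules array (and guard each condition with words[i] && so that a rule needing more words than remain simply fails to match):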

// Rule table, tried in order: [words to take, ...conditions on the upcoming words].
// The trailing [1] has no conditions, so it always matches and emits a single
// word whenever none of the rules above it apply.
const rules = [
  [3, 'less', 'less', 'less'],
  [2, 'less', 'less', 'eqmore'],
  [1, 'eqmore', 'eqmore'],
  [2, 'eqmore', 'less'],
  [2, 'less', 'eqmore'],
  [1],
];
const s = "Lorem ipsum dolor sit amet consectetur adipiscing elit sed doeiusmod tempor incididunt ut Duis aute irure dolor in reprehenderit in esse cillum dolor eu fugia";

const words = s.split(' ');
const output = [];

// 'less' accepts words shorter than 6 characters; any other condition
// accepts words of 6 characters or more.
const verify = (cond, word) => {
  const isShort = word.length < 6;
  return cond === 'less' ? isShort : !isShort;
};

// Consume the word list from the front until nothing is left.
while (words.length !== 0) {
  const [takeCount] = rules.find(([, ...conditions]) =>
    conditions.every((condition, offset) => {
      const candidate = words[offset];
      // A missing (or empty) word fails the condition, so rules that need
      // more words than remain are skipped instead of throwing.
      return candidate && verify(condition, candidate);
    })
  );
  output.push(words.splice(0, takeCount).join(' '));
}
console.log(output);


If we define words with length <6 to have size 1 and >=6 to have size 2, we can rewrite the rules to "if the next word would make the total size of the current row >= 4, start next line".

/**
 * Weight of a word for row packing.
 *
 * @param {string} word - the word to weigh (only its `length` is read)
 * @returns {number} 1 when the length is below 6, otherwise 2
 */
function wordSize(word) {
  return word.length < 6 ? 1 : 2;
}
const s = "Lorem ipsum dolor sit amet consectetur adipiscing elit sed doeiusd tempor incididunt ut Duis aute irure dolor in reprehenderit in esse cillum dolor eu fugia";
const words = s.split(" ");

// Rows completed so far; each row is an array of words.
const rows = [];
// Row currently being filled, and the summed wordSize() of its words.
let row = [];
let rowSize = 0;

for (const word of words) {
  const size = wordSize(word);
  // Adding this word would bring the row to a total size of 4 or more:
  // close the current row and start a fresh one with this word.
  if (rowSize + size >= 4) {
    rows.push(row);
    row = [];
    rowSize = 0;
  }
  row.push(word);
  rowSize += size;
}
// The last row is still open when the loop ends.
rows.push(row);

const result = rows.map((rowWords) => rowWords.join(" "));
console.log(result);

Tags:

Javascript