Compounding english

C++, final word length: 38272

(optimised version took about 20 minutes)

#include <iostream>
#include <string>
#include <vector>

std::size_t calcOverlap(const std::string &a, const std::string &b, std::size_t limit, std::size_t minimal) {
    std::size_t la = a.size();
    for(std::size_t p = std::min(std::min(la, b.size()), limit + 1); -- p > minimal; ) {
        if(a.compare(la - p, p, b, 0, p) == 0) {
            return p;
        }
    }
    return 0;
}

int main() {
    std::vector<std::string> words;

    // Load all words from input
    while(true) {
        std::string word;
        std::getline(std::cin, word);
        if(word.empty()) {
            break;
        }
        words.push_back(word);
    }

    std::cerr
        << "Input word count: " << words.size() << std::endl;

    // Remove all fully subsumed words

    for(auto p = words.begin(); p != words.end(); ) {
        bool subsumed = false;
        for(auto i = words.begin(); i != words.end(); ++ i) {
            if(i == p) {
                continue;
            }
            if(i->find(*p) != std::string::npos) {
                subsumed = true;
                break;
            }
        }
        if(subsumed) {
            p = words.erase(p);
        } else {
            ++ p;
        }
    }

    std::cerr
        << "After subsuming checks: " << words.size()
        << std::endl;

    // Sort words longest-to-shortest (not necessary but doesn't hurt. Makes finding maxlen a tiny bit easier)
    std::sort(words.begin(), words.end(), [](const std::string &a, const std::string &b) {
        return a.size() > b.size();
    });

    std::size_t maxlen = words.front().size();

    // Repeatedly combine most-compatible words until there is only one left
    std::size_t bestPossible = maxlen - 1;
    while(words.size() > 1) {
        auto bestA = words.begin();
        auto bestB = -- words.end();
        std::size_t bestOverlap = 0;
        for(auto p = ++ words.begin(), e = words.end(); p != e; ++ p) {
            if(p->size() - 1 <= bestOverlap) {
                continue;
            }
            for(auto q = words.begin(); q != p; ++ q) {
                std::size_t overlap = calcOverlap(*p, *q, bestPossible, bestOverlap);
                if(overlap > bestOverlap) {
                    bestA = p;
                    bestB = q;
                    bestOverlap = overlap;
                }
                overlap = calcOverlap(*q, *p, bestPossible, bestOverlap);
                if(overlap > bestOverlap) {
                    bestA = q;
                    bestB = p;
                    bestOverlap = overlap;
                }
            }
            if(bestOverlap == bestPossible) {
                break;
            }
        }
        std::string newStr = std::move(*bestA);
        newStr.append(*bestB, bestOverlap, std::string::npos);

        if(bestA == -- words.end()) {
            words.pop_back();
            *bestB = std::move(words.back());
            words.pop_back();
        } else {
            *bestB = std::move(words.back());
            words.pop_back();
            *bestA = std::move(words.back());
            words.pop_back();
        }

        // Remove any words which are now in the result
        for(auto p = words.begin(); p != words.end(); ) {
            if(newStr.find(*p) != std::string::npos) {
                std::cerr << "Now subsumes: " << *p << std::endl;
                p = words.erase(p);
            } else {
                ++ p;
            }
        }

        std::cerr
            << "Words remaining: " << (words.size() + 1)
            << " Latest combination: (" << bestOverlap << ") " << newStr
            << std::endl;

        words.push_back(std::move(newStr));
        bestPossible = bestOverlap; // Merging existing words will never make longer merges possible
    }

    std::string result = words.front();

    std::cout
        << result
        << std::endl;
    std::cerr
        << "Word size: " << result.size()
        << std::endl;
    return 0;
}

Verification bash one-liner:

cat commonwords.txt | while read p; do grep "$p" merged.txt >/dev/null || echo "Not found: $p"; done

It also produced some pretty cool in-progress words. Here are some of my favourites:

  • polyesterday (synthetic nostalgia)
  • afghanistanbul (something [insert politician you dislike] would say)
  • togethernet (the friendly internet)
  • elephantom (a large ghost)
  • thundergroundwaterproof (as in "I don't know why they felt the need to make it thundergroundwaterproof, but it's making me nervous")

And:

  • codescribingo (maybe an upcoming challenge on this site?)

The final output is on pastebin here: http://pastebin.com/j3qYb65b


C++11, 38272 letters, proven optimal

This algorithm is guaranteed to provide a lower bound on the solution. In this case, it is able to achieve the lower bound and output an optimal 38272 letter solution. (This matches the solution found by Dave’s greedy algorithm. I was surprised and a little disappointed to discover that it’s optimal, but, there we are.)

It works by solving the minimum-cost flow problem on the network built as follows.

  • First, any words contained in other words are redundant; discard them.
  • For every word w, draw two nodes w_0 and w_1, where w_0 is a source with capacity 1 and w_1 is a sink with capacity 1.
  • For every (strict) prefix or suffix a of any word, draw a node a.
  • For every suffix a of w, draw an arc from w_0 to a with capacity 1 and cost 0.
  • For every prefix a of w, draw an arc from a to w_1 with capacity 1 and cost length(w) − length(a).

Any string of length n that contains every word can be converted into a flow on this network with cost at most n. Therefore, the minimum cost flow on this network is a lower bound on the length of the shortest such string.

If we’re lucky—and in this case we are—then after we redirect the flow coming into w_1 back out of w_0, we will find an optimal flow that has just one connected component and that passes through the node for the empty string. If so, it will contain an Eulerian circuit starting and ending there. Such an Eulerian circuit can be read back out as a string of optimal length.

If we weren’t lucky, add some extra arcs between the empty string and the shortest strings in the other connected components in order to ensure that an Eulerian circuit exists. The string would no longer necessarily be optimal in that case.

I use the LEMON library for its min-cost flow and Eulerian circuit algorithms. (This was my first time using this library, and I was impressed—I will definitely use it again for future graph algorithm needs.) LEMON comes with four different minimum-cost flow algorithms; you can try them here with --net, --cost, --cap, and --cycle (default).

The program runs in 0.5 seconds, producing this output string.

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <lemon/core.h>
#include <lemon/connectivity.h>
#include <lemon/euler.h>
#include <lemon/maps.h>
#include <lemon/list_graph.h>
#include <lemon/network_simplex.h>
#include <lemon/cost_scaling.h>
#include <lemon/capacity_scaling.h>
#include <lemon/cycle_canceling.h>

using namespace std;

typedef lemon::ListDigraph G;

struct Word {
    G::Node suffix, prefix;
    G::Node tour_node;
};

struct Edge {
    unordered_map<string, Word>::iterator w;
    G::Arc arc;
};

struct Affix {
    vector<Edge> suffix, prefix;
    G::Node node;
    G::Node tour_node;
};

template<class MCF>
bool solve(const G &net, const G::ArcMap<int> &lowerMap, const G::ArcMap<int> &upperMap, const G::ArcMap<int> &costMap, const G::NodeMap<int> &supplyMap, int &totalCost, G::ArcMap<int> &flowMap)
{
    MCF mcf(net);
    if (mcf.lowerMap(lowerMap).upperMap(upperMap).costMap(costMap).supplyMap(supplyMap).run() != mcf.OPTIMAL)
        return false;
    totalCost = mcf.totalCost();
    mcf.flowMap(flowMap);
    return true;
}

int main(int argc, char **argv)
{
    clog << "Reading dictionary from stdin" << endl;
    unordered_map<string, Affix> affixes;
    unordered_map<string, Word> words;
    unordered_set<string> subwords;
    G net, tour;
    G::ArcMap<int> lowerMap(net), upperMap(net), costMap(net);
    G::NodeMap<int> supplyMap(net);
    string new_word;
    while (getline(cin, new_word)) {
        if (subwords.find(new_word) != subwords.end())
            continue;
        for (auto i = new_word.begin(); i != new_word.end(); ++i) {
            for (auto j = new_word.end(); j != i; --j) {
                string s(i, j);
                words.erase(s);
                subwords.insert(s);
            }
        }
        words.emplace(new_word, Word());
    }
    for (auto w = words.begin(); w != words.end(); ++w) {
        w->second.suffix = net.addNode();
        supplyMap.set(w->second.suffix, 1);
        w->second.prefix = net.addNode();
        supplyMap.set(w->second.prefix, -1);
        for (auto i = w->first.begin(); ; ++i) {
            affixes.emplace(string(w->first.begin(), i), Affix()).first->second.prefix.push_back(Edge {w});
            affixes.emplace(string(i, w->first.end()), Affix()).first->second.suffix.push_back(Edge {w});
            if (i == w->first.end())
                break;
        }
        w->second.tour_node = tour.addNode();
    }
    for (auto a = affixes.begin(); a != affixes.end();) {
        if (a->second.suffix.empty() || a->second.prefix.empty() ||
            (a->second.suffix.size() == 1 && a->second.prefix.size() == 1 &&
             a->second.suffix.begin()->w == a->second.prefix.begin()->w)) {
            affixes.erase(a++);
        } else {
            a->second.node = net.addNode();
            supplyMap.set(a->second.node, 0);
            for (auto &e : a->second.suffix) {
                e.arc = net.addArc(e.w->second.suffix, a->second.node);
                lowerMap.set(e.arc, 0);
                upperMap.set(e.arc, 1);
                costMap.set(e.arc, 0);
            }
            for (auto &e : a->second.prefix) {
                e.arc = net.addArc(a->second.node, e.w->second.prefix);
                lowerMap.set(e.arc, 0);
                upperMap.set(e.arc, 1);
                costMap.set(e.arc, e.w->first.length() - a->first.length());
            }
            a->second.tour_node = lemon::INVALID;
            ++a;
        }
    }

    clog << "Read " << words.size() << " words and found " << affixes.size() << " affixes; ";
    clog << "created network with " << countNodes(net) << " nodes and " << countArcs(net) << " arcs" << endl;

    int totalCost;
    G::ArcMap<int> flowMap(net);
    bool solved;
    if (argc > 1 && string(argv[1]) == "--net") {
        clog << "Using network simplex algorithm" << endl;
        solved = solve<lemon::NetworkSimplex<G>>(net, lowerMap, upperMap, costMap, supplyMap, totalCost, flowMap);
    } else if (argc > 1 && string(argv[1]) == "--cost") {
        clog << "Using cost scaling algorithm" << endl;
        solved = solve<lemon::CostScaling<G>>(net, lowerMap, upperMap, costMap, supplyMap, totalCost, flowMap);
    } else if (argc > 1 && string(argv[1]) == "--cap") {
        clog << "Using capacity scaling algorithm" << endl;
        solved = solve<lemon::CapacityScaling<G>>(net, lowerMap, upperMap, costMap, supplyMap, totalCost, flowMap);
    } else if ((argc > 1 && string(argv[1]) == "--cycle") || true) {
        clog << "Using cycle canceling algorithm" << endl;
        solved = solve<lemon::CycleCanceling<G>>(net, lowerMap, upperMap, costMap, supplyMap, totalCost, flowMap);
    }

    if (!solved) {
        clog << "error: no solution found" << endl;
        return 1;
    }
    clog << "Lower bound: " << totalCost << endl;

    G::ArcMap<string> arcLabel(tour);
    G::Node empty = tour.addNode();
    affixes.find("")->second.tour_node = empty;
    for (auto &a : affixes) {
        for (auto &e : a.second.suffix) {
            if (flowMap[e.arc]) {
                if (a.second.tour_node == lemon::INVALID)
                    a.second.tour_node = tour.addNode();
                arcLabel.set(tour.addArc(e.w->second.tour_node, a.second.tour_node), "");
            }
        }
        for (auto &e : a.second.prefix) {
            if (flowMap[e.arc]) {
                if (a.second.tour_node == lemon::INVALID)
                    a.second.tour_node = tour.addNode();
                arcLabel.set(tour.addArc(a.second.tour_node, e.w->second.tour_node), e.w->first.substr(a.first.length()));
            }
        }
    }

    clog << "Created tour graph with " << countNodes(tour) << " nodes and " << countArcs(tour) << " arcs" << endl;

    G::NodeMap<int> compMap(tour);
    int components = lemon::stronglyConnectedComponents(tour, compMap);
    if (components != 1) {
        vector<unordered_map<string, Affix>::iterator> breaks(components, affixes.end());
        for (auto a = affixes.begin(); a != affixes.end(); ++a) {
            if (a->second.tour_node == lemon::INVALID)
                continue;
            int c = compMap[a->second.tour_node];
            if (c == compMap[empty])
                continue;
            auto &b = breaks[compMap[a->second.tour_node]];
            if (b == affixes.end() || b->first.length() > a->first.length())
                b = a;
        }
        int offset = 0;
        for (auto &b : breaks) {
            if (b != affixes.end()) {
                arcLabel.set(tour.addArc(empty, b->second.tour_node), b->first);
                arcLabel.set(tour.addArc(b->second.tour_node, empty), "");
                offset += b->first.length();
            }
        }
        clog << "warning: Found " << components << " components; solution may be suboptimal by up to " << offset << " letters" << endl;
    }

    if (!lemon::eulerian(tour)) {
        clog << "error: failed to make tour graph Eulerian" << endl;
        return 1;
    }

    for (lemon::DiEulerIt<G> e(tour, empty); e != lemon::INVALID; ++e)
        cout << arcLabel[e];
    cout << endl;

    return 0;
}

Java 8, ~5 Minutes, Length of 39,279

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

public class Words {

    public static void main(String[] args) throws Throwable {
        File file = new File("words.txt");
        List<String> wordsList = new ArrayList<>();
        BufferedReader reader = new BufferedReader(new FileReader(file));
        String line;
        while ((line = reader.readLine()) != null) {
            wordsList.add(line);
        }
        reader.close();

        Set<String> words = new HashSet<>();

        System.out.println("Finished I/O");

        for (int i = 0; i < wordsList.size(); i++) { //Step 1: remove any words that occur in other words
            boolean in = false;
            for (int j = 0; j < wordsList.size(); j++) {
                if (i != j && wordsList.get(j).contains(wordsList.get(i))) {
                    in = true;
                    break;
                }
            }
            if (!in) {
                words.add(wordsList.get(i));
            }
        }

        System.out.println("Removed direct containers");

        List<String> working = words.stream().sorted((c, b) -> Integer.compare(c.length(), b.length())).collect(Collectors.toList()); //Sort by length, shortest first
        StringBuilder result = new StringBuilder();
        result.append(working.get(0));
        while (!working.isEmpty()) {
            Optional<String> target = working.stream().sorted((c, b) -> Integer.compare(firstLastCommonality(result.toString(), b), firstLastCommonality(result.toString(), c))).findFirst(); //Find the string that has the greatest in common with the end of 'result'
            if(target.isPresent()) { //It really should be present, but just in case
                String s = target.get();
                working.remove(s);
                int commonality = firstLastCommonality(result.toString(), s);
                s = s.substring(commonality);
                result.append(s);
            }
        }

        System.out.println("Finished algorithm");

        String r = result.toString();
        System.out.println("The string: \n" + r);
        System.out.println("Length: \n" + r.length());
        System.out.println("Verified: \n" + !wordsList.stream().filter(s -> !r.contains(s)).findFirst().isPresent());
    }

    private static int firstLastCommonality(String a, String b) {
        int count = 0;
        int len = b.length();
        while (!a.endsWith(b) && !b.equals("")) {
            b = cutLastChar(b);
            count++;
        }
        return len - count;
    }

    private static String cutLastChar(String string) {
        if (string.length() - 1 < 0) {
            return string;
        } else {
            return string.substring(0, string.length() - 1);
        }
    }

}

Input:

  • a file called 'words.txt' in the working directory, in the exact same format as the file in the main post

Output:

Finished I/O
Removed direct containers
Finished algorithm
The string: 
[Moved to pastebin](http://pastebin.com/iygyR3zL)
Length: 
39279
Verified: 
true