Save a very big CSV to MongoDB using mongoose

Welcome to streaming. What you really want is an "evented stream" that processes your input "one chunk at a time", and of course ideally by a common delimiter such as the "newline" character you are currently using.

For really efficient processing, you can add MongoDB "Bulk API" inserts to make your loading as fast as possible without eating up all of the machine's memory or CPU cycles.

Not advocating this particular package, as there are various solutions available, but here is a listing that uses the line-input-stream package to make the "line terminator" part simple.

Schema definitions by "example" only:

var LineInputStream = require("line-input-stream"),
    fs = require("fs"),
    async = require("async"),
    mongoose = require("mongoose"),
    Schema = mongoose.Schema;

var entrySchema = new Schema({},{ strict: false });

var Entry = mongoose.model( "Entry", entrySchema );

var stream = LineInputStream(fs.createReadStream("data.txt",{ flags: "r" }));

stream.setDelimiter("\n");

// connect so that the "open" handler below actually fires; the URI is only an example
mongoose.connect("mongodb://localhost/test");

mongoose.connection.on("open",function(err,conn) { 

    // lower level method, needs connection
    var bulk = Entry.collection.initializeOrderedBulkOp();
    var counter = 0;

    stream.on("error",function(err) {
        console.log(err); // or otherwise deal with it
    });

    stream.on("line",function(line) {

        async.series(
            [
                function(callback) {
                    var row = line.split(",");     // split the lines on delimiter
                    var obj = {};             
                    // other manipulation

                    bulk.insert(obj);  // Bulk is okay if you don't need schema
                                       // defaults. Or can just set them.

                    counter++;

                    if ( counter % 1000 == 0 ) {
                        stream.pause();
                        bulk.execute(function(err,result) {
                            if (err) return callback(err);
                            // possibly do something with result
                            bulk = Entry.collection.initializeOrderedBulkOp();
                            stream.resume();
                            callback();
                        });
                    } else {
                        callback();
                    }
               }
           ],
           function (err) {
               // each iteration is done
           }
       );

    });

    stream.on("end",function() {

        if ( counter % 1000 != 0 ) {
            bulk.execute(function(err,result) {
                if (err) throw err;   // or something
                // maybe look at result
            });
        }
    });

});

So generally the "stream" interface there "breaks the input down" in order to process "one line at a time", which stops you from loading everything into memory at once.

The main part is the "Bulk Operations API" from MongoDB. This allows you to "queue up" many operations at a time before actually sending them to the server. So in this case, with the use of a "modulo", writes are only sent once per 1000 entries processed. You can really go up to anything within the 16MB BSON limit, but keep it manageable.
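
If you are on a newer driver, the same "queue up, then flush" idea can also be written with collection.bulkWrite(), which takes an array of operations. This is only a sketch of that pattern in isolation (callback style, on driver versions that still accept callbacks), reusing the Entry model and the 1000 batch size from the listing above; queueInsert is just a made-up helper name:

var ops = [];

function queueInsert(obj, done) {
    ops.push({ insertOne: { document: obj } });   // queued in memory only

    if ( ops.length === 1000 ) {
        // one round trip to the server for the whole batch
        Entry.collection.bulkWrite(ops, function(err, result) {
            if (err) return done(err);
            ops = [];                             // start a fresh batch
            done();
        });
    } else {
        done();                                   // nothing sent yet
    }
}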

In addition to the operations being processed in bulk, there is an extra "limiter" in place from the async library. It's not really required, but it ensures that essentially no more than the "modulo limit" of documents is in flight at any time. The batched "inserts" cost nothing but memory until "execute" is called, which is when the actual IO happens, so we wait rather than queuing up more work.

There are surely better solutions you can find for "stream processing" CSV-type data, which is what this appears to be. But in general this gives you the concepts for doing it in a memory-efficient manner without eating CPU cycles either.
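
For instance, here is a rough sketch of just the line-reading step using Node's built-in readline module instead of line-input-stream. The split on "," is still naive and does not handle quoted fields, so a dedicated CSV parser is safer for real data:

var readline = require("readline"),
    fs = require("fs");

// readline emits one "line" event per newline-terminated chunk,
// much like line-input-stream does above
var rl = readline.createInterface({
    input: fs.createReadStream("data.txt")
});

rl.on("line", function(line) {
    var row = line.split(",");   // naive split on the delimiter
    // build a document from row here and queue it for a bulk insert,
    // pausing with rl.pause() / rl.resume() around each execute()
});

rl.on("close", function() {
    // flush any remaining queued documents here
});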


The accepted answer is great and attempts to cover all the important aspects of this problem:

  1. Reading the CSV file as a stream of lines
  2. Writing the documents in batches to MongoDB
  3. Synchronization between reading and writing

While it handles the first two aspects well, relying on async.series() alone for the synchronization won't work as expected; it is the stream.pause()/stream.resume() calls that actually provide it. Consider the "line" handler without them:

stream.on("line",function(line) {
    async.series(
        [
            function(callback) {
                var row = line.split(",");     // split the lines on delimiter
                var obj = {};             
                // other manipulation

                bulk.insert(obj);  // Bulk is okay if you don't need schema
                                   // defaults. Or can just set them.

                counter++;

                if ( counter % 1000 == 0 ) {
                    bulk.execute(function(err,result) {
                        if (err) throw err;   // or do something
                        // possibly do something with result
                        bulk = Entry.collection.initializeOrderedBulkOp();
                        callback();
                    });
                } else {
                    callback();
                }
           }
       ],
       function (err) {
           // each iteration is done
       }
   );
});

Here bulk.execute() is a MongoDB write operation and it's an asynchronous IO call. This allows node.js to proceed with the event loop before bulk.execute() is done with its db writes and calls back.

So it may go on to receive more 'line' events from the stream, queue more documents with bulk.insert(obj), and hit the next modulo boundary, triggering bulk.execute() again before the previous batch has finished.

Let's have a look at this example.

var async = require('async');

var bulk = {
    execute: function(callback) {
        setTimeout(callback, 1000);
    }
};

async.series(
    [
       function (callback) {
           bulk.execute(function() {
              console.log('completed bulk.execute');
              callback(); 
           });
       },
    ], 
    function(err) {

    }
);

console.log("!!! proceeding to read more from stream");

Its output:

!!! proceeding to read more from stream
completed bulk.execute

To really ensure that only one batch of N documents is being processed at any given time, we need to enforce flow control on the file stream using stream.pause() & stream.resume():

var LineInputStream = require("line-input-stream"),
    fs = require("fs"),
    mongoose = require("mongoose"),
    Schema = mongoose.Schema;

var entrySchema = new Schema({},{ strict: false });
var Entry = mongoose.model( "Entry", entrySchema );

var stream = LineInputStream(fs.createReadStream("data.txt",{ flags: "r" }));

stream.setDelimiter("\n");

mongoose.connection.on("open",function(err,conn) { 

    // lower level method, needs connection
    var bulk = Entry.collection.initializeOrderedBulkOp();
    var counter = 0;

    stream.on("error",function(err) {
        console.log(err); // or otherwise deal with it
    });

    stream.on("line",function(line) {
        var row = line.split(",");     // split the lines on delimiter
        var obj = {};             
        // other manipulation

        bulk.insert(obj);  // Bulk is okay if you don't need schema
                           // defaults. Or can just set them.

        counter++;

        if ( counter % 1000 === 0 ) {
            stream.pause(); // let's stop reading from the file until we finish writing this batch to the db

            bulk.execute(function(err,result) {
                if (err) throw err;   // or do something
                // possibly do something with result
                bulk = Entry.collection.initializeOrderedBulkOp();

                stream.resume(); //continue to read from file
            });
        }
    });

    stream.on("end",function() {
        if ( counter % 1000 !== 0 ) {
            bulk.execute(function(err,result) {
                if (err) throw err;   // or something
                // maybe look at result
            });
        }
    });

});
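
As a final note, on modern Node.js versions (with a mongoose/driver that returns promises) the same back-pressure falls out naturally if you consume the lines with async iteration and await each flush. This is only a rough sketch of that idea, and the connection URI is just an example:

const readline = require("readline");
const fs = require("fs");
const mongoose = require("mongoose");

const entrySchema = new mongoose.Schema({}, { strict: false });
const Entry = mongoose.model("Entry", entrySchema);

async function load() {
    await mongoose.connect("mongodb://localhost/test");   // example URI

    const rl = readline.createInterface({
        input: fs.createReadStream("data.txt")
    });

    let ops = [];

    // the loop body (including the awaited bulkWrite) finishes before the
    // next line is pulled, so reads and writes stay in step much like the
    // stream.pause()/stream.resume() calls above
    for await (const line of rl) {
        const row = line.split(",");
        const obj = {};                    // build the document from row

        ops.push({ insertOne: { document: obj } });

        if (ops.length === 1000) {
            await Entry.collection.bulkWrite(ops);
            ops = [];
        }
    }

    if (ops.length > 0) {
        await Entry.collection.bulkWrite(ops);
    }

    await mongoose.disconnect();
}

load().catch(console.error);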