Add mechanism to check analysis before it's executed #215

Merged: 29 commits merged on Nov 10, 2016
Changes from 14 commits

Commits (29)
17925d1
Add mechanism to check analysis before it's executed
jgoizueta Oct 27, 2016
ba97e6e
Fix tests
jgoizueta Oct 27, 2016
023a5d5
:lipstick:
jgoizueta Oct 27, 2016
a6aae8d
Replace count by faster estimation of the number of rows
jgoizueta Oct 27, 2016
74b1fad
Keep precheck failed nodes status in the nodes themselves
jgoizueta Oct 27, 2016
6fc38a4
Fix base Node computeRequirements
jgoizueta Oct 27, 2016
f38729e
Compute requirements for aggregate-intersection analyses
jgoizueta Oct 27, 2016
9808d6c
Fix requirements problem with aliased nodes
jgoizueta Oct 27, 2016
a2e4520
:lipstick:
jgoizueta Oct 27, 2016
f91a33d
Replace comparison operator
jgoizueta Oct 27, 2016
86961cf
Syntax fixes
jgoizueta Oct 27, 2016
b5d2bbd
Fix Node limit computation
jgoizueta Oct 28, 2016
880fa16
Fix requirements & limits names
jgoizueta Oct 28, 2016
3b40450
Integration tests for requirements/limits
jgoizueta Oct 28, 2016
700a116
Add prechecking for line-creation analyses
jgoizueta Oct 28, 2016
fa211d9
Add prechecking for geocoding returning polygons
jgoizueta Oct 28, 2016
e36669e
Avoid using internal details of DatabaseService
jgoizueta Oct 28, 2016
6d6d71c
Handle SQL timeouts during pre-checks
jgoizueta Oct 28, 2016
3d26bfe
Remove testing remnant
jgoizueta Nov 4, 2016
e2e6e59
Estimate requirements and check limits in single step before registration
jgoizueta Nov 4, 2016
baf428c
Expose node id in limit-rejected analyses
jgoizueta Nov 4, 2016
722de54
:lipstick:
jgoizueta Nov 4, 2016
1309a75
Allow unlimited number of rows for a node's output.
jgoizueta Nov 7, 2016
27782e6
Move some limit-checking functionality to Requirements Class
jgoizueta Nov 8, 2016
4d97ee2
Don't limit analyses by number of output rows in general
jgoizueta Nov 8, 2016
e88b88e
Simplify sequential line requirement estimation.
jgoizueta Nov 8, 2016
aa12fef
Fix tests
jgoizueta Nov 8, 2016
e931566
Perform limit-checking before registering the analysis
jgoizueta Nov 8, 2016
b247f61
Raise limit for points per sequential line
jgoizueta Nov 10, 2016
14 changes: 12 additions & 2 deletions lib/analysis.js
@@ -9,6 +9,7 @@ var toposort = require('../lib/dag/toposort');
var validator = require('../lib/dag/validator');

var DatabaseService = require('./service/database');
var Requirements = require('./service/requirements');

var AnalysisLogger = require('./logging/logger');

@@ -60,6 +61,7 @@ AnalysisFactory.prototype.create = function(configuration, definition, callback)
configuration.batch,
configuration.limits
);
var requirements = new Requirements(databaseService, configuration.limits);
var logger = configuration.logger ? new AnalysisLogger(configuration.logger.stream, configuration.user) : undefined;

async.waterfall(
@@ -80,6 +82,16 @@ AnalysisFactory.prototype.create = function(configuration, definition, callback)
return done(err, analysis);
});
},
function analysis$collectRequirements(analysis, done) {
requirements.computeRequirements(analysis, function(err) {
return done(err, analysis);
});
},
function analysis$validateRequirements(analysis, done) {
Contributor:

I'd like to think this through, because I believe we can do it during analysis creation or validation, and probably before registering the analysis in the catalog...

Contributor:

I'd check limits after validating the analysis and before registering it. If some node reaches the limit, the analysis should fail and it shouldn't be registered.

Contributor Author:

I'm changing all this so we get the requirements and validate them in a single operation, and do it before registering, as we discussed.

requirements.validateRequirements(analysis, function(err) {
return done(err, analysis);
});
},
function analysis$queueOperations(analysis, done) {
databaseService.queueAnalysisOperations(analysis, function(err) {
return done(err, analysis);
@@ -100,10 +112,8 @@ AnalysisFactory.prototype.create = function(configuration, definition, callback)
if (err && err.message && err.message.match(/permission denied/i)) {
err = new Error('Analysis requires authentication with API key: permission denied.');
}

return callback(err);
}

return callback(null, analysis);
}
);
53 changes: 53 additions & 0 deletions lib/node/node.js
@@ -468,3 +468,56 @@ function validate(validator, params, expectedParamName) {

return param;
}

Node.prototype.computeRequirements = function(databaseService, limits, callback) {
// By default simply compute maximum of the inputs' number of rows.
// TODO: if the most common multi-input analysis is some kind of join we should use
// the product of the input numberOfRows instead
var maxRows = Math.max.apply(
null,
this.inputNodes.map(function(node) { return node.estimatedRequirements.numberOfRows || 0; })
);
if (maxRows < 0) {
maxRows = 0;
}
this.estimatedRequirements = {
numberOfRows: maxRows
};
this.limits = {
maximumNumberOfRows: getNodeLimit(limits, this.getType(), 'maximumNumberOfRows', 1000000)
Contributor Author:

Since limit values should be definable in Redis, maybe we should use snake_case instead of camelCase?

Contributor:

You mean maximumNumberOfRows?

Contributor Author:

Yes, having maximum_number_of_rows instead.

};
return callback(null, this.requirementMessages());
};

Node.prototype.requirementMessages = function() {
var messages = [];
if (this.estimatedRequirements.numberOfRows > this.limits.maximumNumberOfRows) {
messages.push('too many result rows');
}
return messages;
};

Node.prototype.validateRequirements = function(callback) {
var messages = this.requirementMessages();
var err;
if (messages.length > 0) {
this.status = STATUS.FAILED;
this.errorMessage = messages.join('\n');
err = new Error(this.errorMessage);
}
callback(err);
Contributor:

We must avoid using callbacks in synchronous functions.

Contributor Author:

I was making that function callable in an asynchronous way in case we needed to specialise it for cases in which limits (such as quotas) have to be obtained asynchronously.

But since we don't need async calls now, and we'll change this design anyway, I'll remove the callback.

};

function getNodeLimit(globalLimits, nodeType, limitName, defaultValue) {
var limit = null;
var limits = globalLimits.analyses;
if (limits) {
if (limits[nodeType] !== undefined) {
limits = limits[nodeType];
}
limit = limits[limitName];
}
return limit || defaultValue;
}

module.exports.getNodeLimit = getNodeLimit;
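As a side note for reviewers, the resolution order `getNodeLimit` implements can be sketched in isolation. The function body below is copied from the diff; the sample limit objects are made-up examples, not real configuration:

```javascript
// Copy of getNodeLimit from the diff above, exercised with hypothetical
// limit objects to show the resolution order: a per-node-type entry wins
// over the global analyses entry, which wins over the hard-coded default.
function getNodeLimit(globalLimits, nodeType, limitName, defaultValue) {
    var limit = null;
    var limits = globalLimits.analyses;
    if (limits) {
        if (limits[nodeType] !== undefined) {
            limits = limits[nodeType];
        }
        limit = limits[limitName];
    }
    return limit || defaultValue;
}

// Hypothetical configuration: a global cap plus a stricter cap for sources.
var sampleLimits = {
    analyses: {
        maximumNumberOfRows: 500,             // global fallback
        source: { maximumNumberOfRows: 5 }    // per-node-type override
    }
};

console.log(getNodeLimit(sampleLimits, 'source', 'maximumNumberOfRows', 1000000)); // 5
console.log(getNodeLimit(sampleLimits, 'buffer', 'maximumNumberOfRows', 1000000)); // 500
console.log(getNodeLimit({}, 'buffer', 'maximumNumberOfRows', 1000000));           // 1000000
```

One subtlety of the `limit || defaultValue` return: a configured limit of `0` is falsy and would silently fall back to the default, so zero cannot be used to mean "no rows allowed".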
13 changes: 13 additions & 0 deletions lib/node/nodes/aggregate-intersection.js
@@ -77,3 +77,16 @@ var queryAggregateTemplate = Node.template([
'WHERE ST_Intersects(_cdb_analysis_source.the_geom, _cdb_analysis_target.the_geom)',
'GROUP BY {{=it.groupByColumns}}'
].join('\n'));

AggregateIntersection.prototype.computeRequirements = function(databaseService, limits, callback) {
// we estimate the maximum possible number of rows of the result
var product = this.source.estimatedRequirements.numberOfRows *
Contributor Author:

I guess this is not considered good practice; should I define an accessor and use this.source.getEstimatedRequirements().numberOfRows instead?
(Your opinion is welcome, @dgaubert.)

Contributor:

We have several places where we don't follow the getter/setter pattern. IMHO it's not useful in JavaScript: you can always get a member property directly with the dot operator. My advice is to avoid Cannot read property 'wadus' of null by using default values and so on.

this.target.estimatedRequirements.numberOfRows;
this.estimatedRequirements = {
numberOfRows: product
};
this.limits = {
maximumNumberOfRows: Node.getNodeLimit(limits, TYPE, 'maximumNumberOfRows', 1000000)
};
return callback(null);
Contributor:

Same here: don't use callbacks with synchronous code.

Contributor Author (@jgoizueta, Nov 4, 2016):

In this case we do need the callback parameter, so we have a common interface for all nodes, because some node classes need to perform asynchronous operations in the computeRequirements function (executing SQL code in the database). This particular specialization of computeRequirements is synchronous, but we need to provide for the other cases.

Contributor:

OK, but be careful with CPU-intensive tasks and consider using process.nextTick() if required.

};
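A minimal sketch of the reviewer's process.nextTick() suggestion. The node shape here is hypothetical, not the PR's actual Node class; it only illustrates how a synchronous computeRequirements can keep the same async callback contract as the SQL-backed implementations:

```javascript
// Hypothetical sketch: a synchronous requirements computation that still
// honors the async callback contract by deferring with process.nextTick(),
// so callers never observe the callback firing within the same tick.
function computeRequirementsSync(node, callback) {
    var product = node.source.estimatedRequirements.numberOfRows *
                  node.target.estimatedRequirements.numberOfRows;
    node.estimatedRequirements = { numberOfRows: product };
    process.nextTick(function() {
        callback(null);
    });
}
```

Mixing same-tick and deferred callback invocations across node types is the hazard being discussed; deferring uniformly keeps caller behavior predictable.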
12 changes: 12 additions & 0 deletions lib/node/nodes/filter-category.js
@@ -25,3 +25,15 @@ module.exports = FilterCategory;
FilterCategory.prototype.sql = function() {
return this.category.sql(this.source.getQuery());
};

FilterCategory.prototype.computeRequirements = function(databaseService, limits, callback) {
// We use a very simplistic approach: estimate as many rows as the unfiltered source
// (the actual value is always less than or equal to that)
this.estimatedRequirements = {
numberOfRows: this.source.estimatedRequirements.numberOfRows
};
this.limits = {
maximumNumberOfRows: Node.getNodeLimit(limits, TYPE, 'maximumNumberOfRows', 1000000)
};
return callback(null);
Contributor:

Same here.

};
23 changes: 22 additions & 1 deletion lib/node/nodes/source.js
@@ -30,5 +30,26 @@ Source.prototype.sql = function() {
* @returns {Node.STATUS}
*/
Source.prototype.getStatus = function() {
return Node.STATUS.READY;
return Node.STATUS.READY; // TODO: this ignores the possibility of requirements exceeding the limits
};

var estimatedCountTemplate = Node.template('EXPLAIN (FORMAT JSON) {{=it.sourceQuery}}');

Source.prototype.computeRequirements = function(databaseService, limits, callback) {
var sql = estimatedCountTemplate({
sourceQuery: this.query
});
var self = this;
databaseService.run(sql, function(err, resultSet){
if (err) {
return callback(err);
}
self.estimatedRequirements = {
numberOfRows: resultSet.rows[0]['QUERY PLAN'][0].Plan['Plan Rows']
};
self.limits = {
maximumNumberOfRows: Node.getNodeLimit(limits, TYPE, 'maximumNumberOfRows', 1000000)
};
return callback(null);
});
};
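The row estimate above comes from PostgreSQL's EXPLAIN (FORMAT JSON) output rather than a full COUNT(*), so it is the planner's estimate and can differ from the true row count. A sketch of the result-set shape the code assumes (the sample values are made up, not actual query output):

```javascript
// Hypothetical sample of what databaseService.run would hand back for
// EXPLAIN (FORMAT JSON) over a source query: a single row whose
// 'QUERY PLAN' column is an array holding the top-level Plan object.
var resultSet = {
    rows: [{
        'QUERY PLAN': [{
            Plan: {
                'Node Type': 'Seq Scan',
                'Relation Name': 'postal_codes',
                'Plan Rows': 6          // the planner's row estimate
            }
        }]
    }]
};

// The same access path used by Source.prototype.computeRequirements:
var numberOfRows = resultSet.rows[0]['QUERY PLAN'][0].Plan['Plan Rows'];
console.log(numberOfRows); // 6
```

Because the estimate depends on table statistics being fresh (ANALYZE), stale stats could let an over-limit source slip past the precheck, or reject a source that is actually within limits.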
107 changes: 107 additions & 0 deletions lib/service/requirements.js
@@ -0,0 +1,107 @@
'use strict';

var async = require('async');
var Node = require('../node/node');
var debug = require('../util/debug')('requirements');

var QUERY_RUNNER_READONLY_OP = true;
var QUERY_RUNNER_WRITE_OP = !QUERY_RUNNER_READONLY_OP;

// A priori checking of the requirements/limits of an analysis
function Requirements(databaseService, limits) {
this.databaseService = databaseService;
this.limits = limits;
}

// TODO: consider doing computation & validation in one single process
Contributor:

I'd implement computation & validation for each node in one single process.

Contributor Author:

Sure!

Requirements.prototype.computeRequirements = function (analysis, callback) {
var sortedNodes = analysis.getSortedNodes();
var allNodes = analysis.getNodes();
var aliasedNodesPresent = allNodes.length > sortedNodes.length;
var self = this;
async.eachSeries(
sortedNodes,
function(node, done) {
node.computeRequirements(self.databaseService, self.limits, function(err) {
if (aliasedNodesPresent) {
// some nodes are aliased (multiple nodes with the same id);
// we need to replicate the requirements and limits to them, because
// another node later in the sequence may try to access them
replicateRequirementsToAliases(node, allNodes);
}
return done(err);
});
},
function finish(err) {
if (err) {
return callback(err);
}
return callback(null);
}
);
};

// Validates analysis requirements, node by node individually; as soon as
// a node fails to pass the requirements this is aborted, the node status
// and error message are stored in the catalog, and the error is returned to
// the callback.
Requirements.prototype.validateRequirements = function (analysis, callback) {
var self = this;
async.eachSeries(
analysis.getSortedNodes(),
function(node, done) {
node.validateRequirements(function(err) {
if (err) {
// register the failed status
var sql = updateNodeAsFailedAtAnalysisCatalogQuery([node.id()], err.message);
self.databaseService.queryRunner.run(sql, QUERY_RUNNER_WRITE_OP, function(sql_err) {
if (sql_err) {
// FIXME: what should we do if saving the status fails?
debug('SQL ERROR:', sql_err);
}
return done(err);
});
} else {
return done(err);
}
});
},
callback
);
};

module.exports = Requirements;

function replicateRequirementsToAliases(node, allNodes) {
var id = node.id();
allNodes.forEach(function(otherNode) {
Contributor:

forEach visits all items of the collection; there is no way to abort the loop once we've replicated the requirements into the first matching node (the following iterations are unnecessary). I'd use a classic for (var i = 0; ...) {} and return when the node requirements are replicated.

Contributor:

Uhmm! 🤔 We need to replicate into more than one node. Sorry, I didn't see it.

if (otherNode.id() === id && !otherNode.estimatedRequirements) {
otherNode.estimatedRequirements = node.estimatedRequirements;
otherNode.limits = node.limits;
}
});
}

function pgQuoteCastMapper(cast) {
return function(input) {
return '\'' + input + '\'' + (cast ? ('::' + cast) : '');
};
}

function updateNodeAtAnalysisCatalogQuery(nodeIds, columns) {
nodeIds = Array.isArray(nodeIds) ? nodeIds : [nodeIds];
return [
'UPDATE cdb_analysis_catalog SET',
columns.join(','),
'WHERE node_id IN (' + nodeIds.map(pgQuoteCastMapper()).join(', ') + ')'
].join('\n');
}

function updateNodeAsFailedAtAnalysisCatalogQuery(nodeIds, errorMessage) {
var status = Node.STATUS.FAILED;
return updateNodeAtAnalysisCatalogQuery(nodeIds, [
'status = \'' + status + '\'',
'last_error_message = $last_error_message$' + errorMessage + '$last_error_message$',
'updated_at = NOW()'
]);
}
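The $last_error_message$…$last_error_message$ dollar quoting above keeps error messages containing single quotes from breaking the generated SQL. A sketch of the statement the builders produce, reusing the functions from the diff (the node id is made up, and the status value is hard-coded as the string 'failed' for illustration rather than read from Node.STATUS):

```javascript
// Copies of the query builders above, exercised with a hypothetical node id
// and an error message containing a single quote.
function pgQuoteCastMapper(cast) {
    return function(input) {
        return '\'' + input + '\'' + (cast ? ('::' + cast) : '');
    };
}

function updateNodeAtAnalysisCatalogQuery(nodeIds, columns) {
    nodeIds = Array.isArray(nodeIds) ? nodeIds : [nodeIds];
    return [
        'UPDATE cdb_analysis_catalog SET',
        columns.join(','),
        'WHERE node_id IN (' + nodeIds.map(pgQuoteCastMapper()).join(', ') + ')'
    ].join('\n');
}

function updateNodeAsFailedAtAnalysisCatalogQuery(nodeIds, errorMessage) {
    return updateNodeAtAnalysisCatalogQuery(nodeIds, [
        'status = \'failed\'',
        'last_error_message = $last_error_message$' + errorMessage + '$last_error_message$',
        'updated_at = NOW()'
    ]);
}

var sql = updateNodeAsFailedAtAnalysisCatalogQuery('a0fe3c87', "node doesn't fit limits");
console.log(sql);
// UPDATE cdb_analysis_catalog SET
// status = 'failed',last_error_message = $last_error_message$node doesn't fit limits$last_error_message$,updated_at = NOW()
// WHERE node_id IN ('a0fe3c87')
```

Dollar quoting still breaks if the message itself ever contains the literal $last_error_message$ tag; a parameterized query would be more robust than string interpolation here.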
13 changes: 13 additions & 0 deletions test/fixtures/table/postal_codes.sql
@@ -0,0 +1,13 @@
CREATE TABLE postal_codes (
cartodb_id integer NOT NULL,
the_geom geometry(Geometry,4326),
the_geom_webmercator geometry(Geometry,3857),
code text
);

ALTER TABLE ONLY postal_codes
ADD CONSTRAINT postal_codes_pkey PRIMARY KEY (cartodb_id);

CREATE INDEX postal_codes_the_geom_idx ON postal_codes USING gist (the_geom);

CREATE INDEX postal_codes_the_geom_webmercator_idx ON postal_codes USING gist (the_geom_webmercator);
66 changes: 66 additions & 0 deletions test/integration/analysis.js
@@ -102,6 +102,72 @@ describe('workflow', function() {
});
});

it('should compute node requirements and limits for source', function(done) {
Analysis.create(testConfig, sourceAnalysisDefinition, function(err, analysis) {
assert.ok(!err, err);
assert.equal(analysis.getRoot().estimatedRequirements.numberOfRows, 6);
assert.equal(analysis.getRoot().limits.maximumNumberOfRows, 1000000);
done();
});
});

it('should abort analysis over the limits for source', function(done) {
var limitedConfig = testConfig.create({
limits: {
analyses: {
source: {
maximumNumberOfRows: 5
}
}
}
});
Analysis.create(limitedConfig, sourceAnalysisDefinition, function(err) {
assert.ok(err);
done();
});
});

it('should compute node requirements and limits for trade areas', function(done) {
var enqueueFn = BatchClient.prototype.enqueue;

BatchClient.prototype.enqueue = function(query, callback) {
return callback(null, {status: 'ok'});
};

Analysis.create(testConfig, tradeAreaAnalysisDefinition, function(err, analysis) {
BatchClient.prototype.enqueue = enqueueFn;

assert.ok(!err, err);
assert.equal(analysis.getRoot().estimatedRequirements.numberOfRows, 6);
assert.equal(analysis.getRoot().limits.maximumNumberOfRows, 1000000);
done();
});
});

it('should abort analysis over the limits for trade areas', function(done) {
var limitedConfig = testConfig.create({
limits: {
analyses: {
'trade-area': {
maximumNumberOfRows: 5
}
}
}
});

var enqueueFn = BatchClient.prototype.enqueue;
BatchClient.prototype.enqueue = function(query, callback) {
return callback(null, {status: 'ok'});
};

Analysis.create(limitedConfig, tradeAreaAnalysisDefinition, function(err) {
BatchClient.prototype.enqueue = enqueueFn;

assert.ok(err);
done();
});
});

});

});
3 changes: 2 additions & 1 deletion test/setup.js
@@ -33,7 +33,8 @@ before(function setupTestDatabase(done) {

fs.realpathSync('./test/fixtures/table/madrid_districts.sql'),
fs.realpathSync('./test/fixtures/table/atm_machines.sql'),
fs.realpathSync('./test/fixtures/table/airbnb_rooms.sql')
fs.realpathSync('./test/fixtures/table/airbnb_rooms.sql'),
fs.realpathSync('./test/fixtures/table/postal_codes.sql')
];

async.waterfall(