-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add mechanism to check analysis before it's executed #215
Changes from 14 commits
17925d1
ba97e6e
023a5d5
a6aae8d
74b1fad
6fc38a4
f38729e
9808d6c
a2e4520
f91a33d
86961cf
b5d2bbd
880fa16
3b40450
700a116
fa211d9
e36669e
6d6d71c
3d26bfe
e2e6e59
baf428c
722de54
1309a75
27782e6
4d97ee2
e88b88e
aa12fef
e931566
b247f61
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -468,3 +468,56 @@ function validate(validator, params, expectedParamName) { | |
|
||
return param; | ||
} | ||
|
||
Node.prototype.computeRequirements = function(databaseService, limits, callback) {
    // Default estimation: the output of a node cannot be larger than its
    // biggest input, so take the maximum of the inputs' estimated row counts.
    // An input with no estimate counts as 0 rows; with no inputs at all the
    // estimate is 0.
    // TODO: if the most common multi-input analysis is some kind of join we should use
    // the product of the input numberOfRows instead
    var inputRowCounts = this.inputNodes.map(function(inputNode) {
        return inputNode.estimatedRequirements.numberOfRows || 0;
    });
    var estimatedRows = 0;
    inputRowCounts.forEach(function(rows) {
        if (rows > estimatedRows) {
            estimatedRows = rows;
        }
    });
    this.estimatedRequirements = {
        numberOfRows: estimatedRows
    };
    this.limits = {
        maximumNumberOfRows: getNodeLimit(limits, this.getType(), 'maximumNumberOfRows', 1000000)
    };
    // Callback-style on purpose: subclasses may need async work (e.g. queries)
    // to compute their requirements, so all nodes share this interface.
    return callback(null, this.requirementMessages());
};
|
||
Node.prototype.requirementMessages = function() {
    // Returns a list of human-readable messages, one per violated limit;
    // an empty list means the node is within all its limits.
    var exceedsRowLimit = this.estimatedRequirements.numberOfRows > this.limits.maximumNumberOfRows;
    return exceedsRowLimit ? ['too many result rows'] : [];
};
|
||
Node.prototype.validateRequirements = function(callback) {
    // Checks the estimated requirements against the node limits.
    // On violation the node is marked as FAILED, its errorMessage is set,
    // and an Error is passed to the callback; otherwise callback(undefined).
    // Callback-style so subclasses can obtain limits (e.g. quotas) asynchronously.
    var failures = this.requirementMessages();
    if (failures.length === 0) {
        return callback(undefined);
    }
    this.status = STATUS.FAILED;
    this.errorMessage = failures.join('\n');
    callback(new Error(this.errorMessage));
};
|
||
/**
 * Looks up a limit value for a node type in the global limits configuration.
 *
 * Resolution order:
 *   1. globalLimits.analyses[nodeType][limitName]  (per-node-type override)
 *   2. globalLimits.analyses[limitName]            (global analyses default,
 *      only when no per-node-type section exists)
 *   3. defaultValue
 *
 * Note: when a per-node-type section exists but lacks limitName, the global
 * analyses value is NOT consulted — the default is returned instead.
 *
 * @param {Object} globalLimits - limits configuration, may have an `analyses` section
 * @param {String} nodeType - node type key (e.g. 'aggregate-intersection')
 * @param {String} limitName - limit key (e.g. 'maximumNumberOfRows')
 * @param {*} defaultValue - fallback when no limit is configured
 * @returns {*} the configured limit or defaultValue
 */
function getNodeLimit(globalLimits, nodeType, limitName, defaultValue) {
    var limit = null;
    var limits = globalLimits.analyses;
    if (limits) {
        if (limits[nodeType] !== undefined) {
            limits = limits[nodeType];
        }
        limit = limits[limitName];
    }
    // Use an explicit null/undefined check rather than `limit || defaultValue`
    // so that an explicitly configured limit of 0 is honored instead of being
    // silently replaced by the default.
    return (limit === null || limit === undefined) ? defaultValue : limit;
}
|
||
module.exports.getNodeLimit = getNodeLimit; |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -77,3 +77,16 @@ var queryAggregateTemplate = Node.template([ | |
'WHERE ST_Intersects(_cdb_analysis_source.the_geom, _cdb_analysis_target.the_geom)', | ||
'GROUP BY {{=it.groupByColumns}}' | ||
].join('\n')); | ||
|
||
AggregateIntersection.prototype.computeRequirements = function(databaseService, limits, callback) {
    // Worst case for an intersection-based aggregation is the cartesian
    // product of both inputs, so estimate source rows times target rows.
    var sourceRows = this.source.estimatedRequirements.numberOfRows;
    var targetRows = this.target.estimatedRequirements.numberOfRows;
    this.estimatedRequirements = {
        numberOfRows: sourceRows * targetRows
    };
    this.limits = {
        maximumNumberOfRows: Node.getNodeLimit(limits, TYPE, 'maximumNumberOfRows', 1000000)
    };
    // Callback kept for interface uniformity: some node classes compute
    // requirements asynchronously.
    return callback(null);
};
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,3 +25,15 @@ module.exports = FilterCategory; | |
FilterCategory.prototype.sql = function() { | ||
return this.category.sql(this.source.getQuery()); | ||
}; | ||
|
||
FilterCategory.prototype.computeRequirements = function(databaseService, limits, callback) {
    // Simplistic upper bound: a filter never produces more rows than its
    // unfiltered source, so reuse the source estimate as-is.
    var sourceRows = this.source.estimatedRequirements.numberOfRows;
    this.estimatedRequirements = {
        numberOfRows: sourceRows
    };
    this.limits = {
        maximumNumberOfRows: Node.getNodeLimit(limits, TYPE, 'maximumNumberOfRows', 1000000)
    };
    // Callback kept for interface uniformity with asynchronous nodes.
    return callback(null);
};
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
'use strict'; | ||
|
||
var async = require('async'); | ||
var Node = require('../node/node'); | ||
var debug = require('../util/debug')('requirements'); | ||
|
||
var QUERY_RUNNER_READONLY_OP = true; | ||
var QUERY_RUNNER_WRITE_OP = !QUERY_RUNNER_READONLY_OP; | ||
|
||
// A priori checking of the requirements/limits of an analysis | ||
/**
 * A priori checking of the requirements/limits of an analysis.
 *
 * @param {Object} databaseService - service used to run catalog queries
 * @param {Object} limits - global limits configuration
 * @constructor
 */
function Requirements(databaseService, limits) {
    this.limits = limits;
    this.databaseService = databaseService;
}
|
||
// TODO: consider doing computation & validation in one single process
Requirements.prototype.computeRequirements = function (analysis, callback) {
    // Walks the nodes in topological order so every node sees its inputs'
    // estimated requirements before computing its own.
    var self = this;
    var sortedNodes = analysis.getSortedNodes();
    var allNodes = analysis.getNodes();
    // More nodes than sorted nodes means some nodes share an id (aliases).
    var hasAliasedNodes = allNodes.length > sortedNodes.length;
    async.eachSeries(
        sortedNodes,
        function(node, done) {
            node.computeRequirements(self.databaseService, self.limits, function(err) {
                if (hasAliasedNodes) {
                    // Replicate requirements/limits onto aliased nodes (same id),
                    // since a later node in the sequence may read them.
                    replicateRequirementsToAliases(node, allNodes);
                }
                return done(err);
            });
        },
        function finish(err) {
            return callback(err ? err : null);
        }
    );
};
|
||
// Validates analysis requirements, node by node individually; as soon as
// a node fails to pass the requirements this is aborted, the node status
// and error message stored in the catalog, and the error is returned to
// the callback.
Requirements.prototype.validateRequirements = function (analysis, callback) {
    var self = this;
    async.eachSeries(
        analysis.getSortedNodes(),
        function(node, done) {
            node.validateRequirements(function(err) {
                if (!err) {
                    return done(err);
                }
                // Persist the failed status before aborting the iteration.
                var sql = updateNodeAsFailedAtAnalysisCatalogQuery([node.id()], err.message);
                self.databaseService.queryRunner.run(sql, QUERY_RUNNER_WRITE_OP, function(sqlErr) {
                    if (sqlErr) {
                        // FIXME: what should we do if saving the status fails?
                        debug('SQL ERROR:', sqlErr);
                    }
                    return done(err);
                });
            });
        },
        callback
    );
};
|
||
module.exports = Requirements; | ||
|
||
// Copies the requirements and limits of `node` to every other node in
// `allNodes` that shares its id (an alias) and has not been computed yet.
// Aliased nodes share requirement/limit objects by reference on purpose.
function replicateRequirementsToAliases(node, allNodes) {
    var nodeId = node.id();
    allNodes
        .filter(function(candidate) {
            return candidate.id() === nodeId && !candidate.estimatedRequirements;
        })
        .forEach(function(alias) {
            alias.estimatedRequirements = node.estimatedRequirements;
            alias.limits = node.limits;
        });
}
|
||
// Returns a mapper that wraps a value in single quotes, optionally
// appending a PostgreSQL ::cast suffix.
// NOTE(review): the input is not escaped — this assumes trusted values
// (internal node ids). Confirm before reusing with external input.
function pgQuoteCastMapper(cast) {
    var suffix = cast ? '::' + cast : '';
    return function(input) {
        return '\'' + input + '\'' + suffix;
    };
}
|
||
// Builds an UPDATE statement over cdb_analysis_catalog for the given node
// id(s), setting the provided `columns` ("col = value" strings).
// Accepts a single node id or an array of them.
function updateNodeAtAnalysisCatalogQuery(nodeIds, columns) {
    var ids = Array.isArray(nodeIds) ? nodeIds : [nodeIds];
    var quotedIds = ids.map(pgQuoteCastMapper()).join(', ');
    var statement = [
        'UPDATE cdb_analysis_catalog SET',
        columns.join(','),
        'WHERE node_id IN (' + quotedIds + ')'
    ];
    return statement.join('\n');
}
|
||
// Builds the catalog UPDATE that marks the given node(s) as FAILED,
// recording the error message and bumping updated_at.
// NOTE(review): if errorMessage ever contained the $last_error_message$
// delimiter the dollar quoting would break — confirm messages stay internal.
function updateNodeAsFailedAtAnalysisCatalogQuery(nodeIds, errorMessage) {
    var columns = [
        'status = \'' + Node.STATUS.FAILED + '\'',
        'last_error_message = $last_error_message$' + errorMessage + '$last_error_message$',
        'updated_at = NOW()'
    ];
    return updateNodeAtAnalysisCatalogQuery(nodeIds, columns);
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
-- Test fixture: postal code geometries used by the analysis limit tests.
CREATE TABLE postal_codes (
    cartodb_id integer NOT NULL,
    the_geom geometry(Geometry,4326),             -- geometry in WGS84 (lon/lat)
    the_geom_webmercator geometry(Geometry,3857), -- geometry in Web Mercator
    code text                                     -- postal code identifier
);

ALTER TABLE ONLY postal_codes
    ADD CONSTRAINT postal_codes_pkey PRIMARY KEY (cartodb_id);

-- Spatial indexes so intersection-based analyses over this fixture are fast.
CREATE INDEX postal_codes_the_geom_idx ON postal_codes USING gist (the_geom);

CREATE INDEX postal_codes_the_geom_webmercator_idx ON postal_codes USING gist (the_geom_webmercator);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would like to think about this more deeply, because I think we can do it during analysis creation or validation, and probably before registering the analysis in the catalog...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd check limits after validating analysis and before registering it. If some node reaches the limit the analysis should fail and it shouldn't be registered.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm changing all this so we get the requirements and validate them in a single operation, and do it before registering, as we have talked about.