Skip to content

Commit

Permalink
Cleanup and optimize DB schema
Browse files Browse the repository at this point in the history
The database schema was lacking many integrity checks and indexes.
Correct it by first applying a migration that removes all old/stale
data, then creating those indexes.

The driving reason for this is the very slow performance of processing
(specially aggregation) on the new mezuro.org servers. It will hopefully
remove (or at least heavily improve) the superlinear slowdown when the
number of metrics rises, as observed in #207.

Additionally, remove the timestamp columns from kalibro_modules,
module_results and metric_results: they are not used in any way, and
there are millions of rows containing them. It's possible and probably
desirable to just look at the timestamps in the processing.
  • Loading branch information
danielkza committed Jul 19, 2016
1 parent 28b1a49 commit 4e72857
Show file tree
Hide file tree
Showing 12 changed files with 167 additions and 20 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rdoc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ KalibroProcessor is the processing web service for Mezuro.

== Unreleased

* Optimize database structure by adding foreign keys and indexes where needed
* Introduce performance tests for Aggregator
* Update KolektiMetricfu

Expand Down
73 changes: 73 additions & 0 deletions db/migrate/20160715192850_clean_inconsistencies.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
class CleanInconsistencies < ActiveRecord::Migration
def self.up
# Unset project reference for repositories with non-existing projects
execute <<-SQL
UPDATE repositories AS r
SET project_id = NULL
WHERE project_id = 0 OR NOT EXISTS (
SELECT 1 FROM projects AS p WHERE p.id = r.project_id
)
SQL

# Delete processings with non-existing repositories
execute <<-SQL
DELETE FROM processings AS p
WHERE NOT EXISTS(
SELECT 1 FROM repositories AS r WHERE r.id = p.repository_id
)
SQL

# Delete process times with non-existing processings
execute <<-SQL
DELETE FROM process_times AS t
WHERE NOT EXISTS (
SELECT 1 FROM processings AS p WHERE p.id = t.processing_id
)
SQL

# Delete module results with non-existing processings
execute <<-SQL
DELETE FROM module_results AS m
WHERE NOT EXISTS (
SELECT 1 FROM processings AS p WHERE p.id = m.processing_id
)
SQL

# Delete kalibro modules with non-existing module results
execute <<-SQL
DELETE FROM kalibro_modules AS k
WHERE NOT EXISTS (
SELECT 1 FROM module_results AS m WHERE m.id = k.module_result_id
)
SQL

# Delete metric results with non-existing module results
execute <<-SQL
DELETE FROM metric_results AS met
WHERE NOT EXISTS (
SELECT 1 FROM module_results AS mod WHERE mod.id = met.module_result_id
)
SQL

# Delete duplicate metric_results. Group them by (module_result, metric_configuration),
# then delete all but the one with the highest ID
# The double wrapping on the inner query is necessary because window functions
# cannot be used in WHERE in PostgreSQL.
execute <<-SQL
DELETE FROM metric_results
WHERE id IN (
SELECT t.id FROM (
SELECT id, ROW_NUMBER() OVER (PARTITION BY module_result_id, metric_configuration_id, "type"
ORDER BY id DESC) AS rnum
FROM metric_results
WHERE "type" = 'TreeMetricResult'
) AS t
WHERE t.rnum > 1
)
SQL
end

def self.down
raise ActiveRecord::IrreversibleMigration
end
end
6 changes: 6 additions & 0 deletions db/migrate/20160715192900_add_indexes_to_kalibro_modules.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class AddIndexesToKalibroModules < ActiveRecord::Migration
def change
add_foreign_key :kalibro_modules, :module_results, on_delete: :cascade
add_index :kalibro_modules, [:long_name, :granularity]
end
end
6 changes: 6 additions & 0 deletions db/migrate/20160715192901_add_indexes_to_module_results.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class AddIndexesToModuleResults < ActiveRecord::Migration
def change
add_foreign_key :module_results, :module_results, column: 'parent_id'
add_foreign_key :module_results, :processings, on_delete: :cascade
end
end
11 changes: 11 additions & 0 deletions db/migrate/20160715192902_add_indexes_to_metric_results.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
class AddIndexesToMetricResults < ActiveRecord::Migration
def change
add_foreign_key :metric_results, :module_results, on_delete: :cascade
add_index :metric_results, :type
add_index :metric_results, :module_result_id
add_index :metric_results, :metric_configuration_id
add_index :metric_results, [:module_result_id, :metric_configuration_id],
unique: true, where: "type = 'TreeMetricResult'",
name: 'metric_results_module_res_metric_cfg_uniq_idx'
end
end
6 changes: 6 additions & 0 deletions db/migrate/20160715192903_add_indexes_to_processings.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class AddIndexesToProcessings < ActiveRecord::Migration
def change
add_foreign_key :processings, :repositories
add_foreign_key :processings, :module_results, column: 'root_module_result_id'
end
end
5 changes: 5 additions & 0 deletions db/migrate/20160715192904_add_indexes_to_process_times.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
class AddIndexesToProcessTimes < ActiveRecord::Migration
def change
add_foreign_key :process_times, :processings, on_delete: :cascade
end
end
5 changes: 5 additions & 0 deletions db/migrate/20160715192905_add_indexes_to_repositories.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
class AddIndexesToRepositories < ActiveRecord::Migration
def change
add_foreign_key :repositories, :projects
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class RemoveTimestampsFromModuleResults < ActiveRecord::Migration
def change
remove_column :module_results, :created_at, :string
remove_column :module_results, :updated_at, :string
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class RemoveTimestampsFromMetricResults < ActiveRecord::Migration
def change
remove_column :metric_results, :created_at, :string
remove_column :metric_results, :updated_at, :string
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class RemoveTimestampsFromKalibroModules < ActiveRecord::Migration
def change
remove_column :kalibro_modules, :created_at, :string
remove_column :kalibro_modules, :updated_at, :string
end
end
56 changes: 36 additions & 20 deletions db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 20151002172231) do
ActiveRecord::Schema.define(version: 20160716015915) do

# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
Expand All @@ -33,36 +33,38 @@
add_index "delayed_jobs", ["priority", "run_at"], name: "delayed_jobs_priority", using: :btree

create_table "kalibro_modules", force: :cascade do |t|
t.string "long_name", limit: 255
t.string "granularity", limit: 255
t.datetime "created_at"
t.datetime "updated_at"
t.integer "module_result_id"
t.string "long_name", limit: 255
t.string "granularity", limit: 255
t.integer "module_result_id"
end

add_index "kalibro_modules", ["long_name", "granularity"], name: "index_kalibro_modules_on_long_name_and_granularity", using: :btree
add_index "kalibro_modules", ["module_result_id"], name: "index_kalibro_modules_on_module_result_id", using: :btree

create_table "metric_results", force: :cascade do |t|
t.integer "module_result_id"
t.integer "metric_configuration_id"
t.float "value"
t.datetime "created_at"
t.datetime "updated_at"
t.string "type", default: "MetricResult", null: false
t.integer "line_number"
t.text "message"
t.integer "related_hotspot_metric_results_id"
t.integer "module_result_id"
t.integer "metric_configuration_id"
t.float "value"
t.string "type", default: "MetricResult", null: false
t.integer "line_number"
t.text "message"
t.integer "related_hotspot_metric_results_id"
end

add_index "metric_results", ["metric_configuration_id"], name: "index_metric_results_on_metric_configuration_id", using: :btree
add_index "metric_results", ["module_result_id", "metric_configuration_id"], name: "metric_results_module_res_metric_cfg_uniq_idx", unique: true, where: "((type)::text = 'TreeMetricResult'::text)", using: :btree
add_index "metric_results", ["module_result_id"], name: "index_metric_results_on_module_result_id", using: :btree
add_index "metric_results", ["related_hotspot_metric_results_id"], name: "index_metric_results_on_related_hotspot_metric_results_id", using: :btree
add_index "metric_results", ["type"], name: "index_metric_results_on_type", using: :btree

create_table "module_results", force: :cascade do |t|
t.float "grade"
t.integer "parent_id"
t.datetime "created_at"
t.datetime "updated_at"
t.integer "processing_id"
t.float "grade"
t.integer "parent_id"
t.integer "processing_id"
end

add_index "module_results", ["parent_id"], name: "index_module_results_on_parent_id", using: :btree
add_index "module_results", ["processing_id"], name: "index_module_results_on_processing_id", using: :btree

create_table "process_times", force: :cascade do |t|
t.string "state", limit: 255
Expand All @@ -72,6 +74,8 @@
t.float "time"
end

add_index "process_times", ["processing_id"], name: "index_process_times_on_processing_id", using: :btree

create_table "processings", force: :cascade do |t|
t.string "state", limit: 255
t.integer "repository_id"
Expand All @@ -81,6 +85,8 @@
t.text "error_message"
end

add_index "processings", ["repository_id"], name: "index_processings_on_repository_id", using: :btree

create_table "projects", force: :cascade do |t|
t.string "name", limit: 255
t.string "description", limit: 255
Expand All @@ -106,5 +112,15 @@
t.string "branch", default: "master", null: false
end

add_index "repositories", ["project_id"], name: "index_repositories_on_project_id", using: :btree

add_foreign_key "kalibro_modules", "module_results", on_delete: :cascade
add_foreign_key "metric_results", "module_results", on_delete: :cascade
add_foreign_key "metric_results", "related_hotspot_metric_results", column: "related_hotspot_metric_results_id"
add_foreign_key "module_results", "module_results", column: "parent_id"
add_foreign_key "module_results", "processings", on_delete: :cascade
add_foreign_key "process_times", "processings", on_delete: :cascade
add_foreign_key "processings", "module_results", column: "root_module_result_id"
add_foreign_key "processings", "repositories"
add_foreign_key "repositories", "projects"
end

0 comments on commit 4e72857

Please sign in to comment.