From 6e9cec83e6d382629f8e8dc5a0bb9f2ae376a6b6 Mon Sep 17 00:00:00 2001
From: Daniel Miranda
Date: Fri, 15 Jul 2016 22:24:56 -0300
Subject: [PATCH] Cleanup and optimize DB schema

The database schema was lacking many integrity checks and indexes. Correct it
by first applying a migration that removes all old/stale data, then creating
those indexes.

The driving reason for this is the very slow performance of processing
(especially aggregation) on the new mezuro.org servers. It will hopefully
remove (or at least heavily improve) the superlinear slowdown when the number
of metrics rises, as observed in #207.

Additionally, remove the timestamp columns from kalibro_modules,
module_results and metric_results: they are not used in any way, and there are
millions of rows containing them. If timestamps are needed, it's possible and
probably desirable to just look at the ones in the processing.
---
 CHANGELOG.rdoc | 1 +
 .../20160715192850_clean_inconsistencies.rb | 71 +++++++++++++++++++
 ...15192900_add_indexes_to_kalibro_modules.rb | 6 ++
 ...715192901_add_indexes_to_module_results.rb | 6 ++
 ...715192902_add_indexes_to_metric_results.rb | 11 +++
 ...160715192903_add_indexes_to_processings.rb | 6 ++
 ...0715192904_add_indexes_to_process_times.rb | 5 ++
 ...60715192905_add_indexes_to_repositories.rb | 5 ++
 ...1_remove_timestamps_from_module_results.rb | 6 ++
 ...8_remove_timestamps_from_metric_results.rb | 6 ++
 ..._remove_timestamps_from_kalibro_modules.rb | 6 ++
 db/schema.rb | 56 +++++++++------
 12 files changed, 165 insertions(+), 20 deletions(-)
 create mode 100644 db/migrate/20160715192850_clean_inconsistencies.rb
 create mode 100644 db/migrate/20160715192900_add_indexes_to_kalibro_modules.rb
 create mode 100644 db/migrate/20160715192901_add_indexes_to_module_results.rb
 create mode 100644 db/migrate/20160715192902_add_indexes_to_metric_results.rb
 create mode 100644 db/migrate/20160715192903_add_indexes_to_processings.rb
 create mode 100644 db/migrate/20160715192904_add_indexes_to_process_times.rb
 create
mode 100644 db/migrate/20160715192905_add_indexes_to_repositories.rb
 create mode 100644 db/migrate/20160716000521_remove_timestamps_from_module_results.rb
 create mode 100644 db/migrate/20160716000528_remove_timestamps_from_metric_results.rb
 create mode 100644 db/migrate/20160716000530_remove_timestamps_from_kalibro_modules.rb

diff --git a/CHANGELOG.rdoc b/CHANGELOG.rdoc
index 5823de5..c97f3d6 100644
--- a/CHANGELOG.rdoc
+++ b/CHANGELOG.rdoc
@@ -4,6 +4,7 @@ KalibroProcessor is the processing web service for Mezuro.
 
 == Unreleased
 
+* Optimize database structure by adding foreign keys and indexes where needed
 * Introduce performance tests for Aggregator
 * Update KolektiMetricfu
 
diff --git a/db/migrate/20160715192850_clean_inconsistencies.rb b/db/migrate/20160715192850_clean_inconsistencies.rb
new file mode 100644
index 0000000..3a63eed
--- /dev/null
+++ b/db/migrate/20160715192850_clean_inconsistencies.rb
@@ -0,0 +1,78 @@
+# Removes rows that would violate the foreign keys and the unique index added
+# by the AddIndexesTo* migrations: dangling references and duplicated tree
+# metric results.
+class CleanInconsistencies < ActiveRecord::Migration
+  def self.up
+    # Unset project reference for repositories with non-existing projects
+    execute <<-SQL
+      UPDATE repositories AS r
+      SET project_id = NULL
+      WHERE project_id = 0 OR NOT EXISTS (
+        SELECT 1 FROM projects AS p WHERE p.id = r.project_id
+      )
+    SQL
+
+    # Delete processings with non-existing repositories
+    execute <<-SQL
+      DELETE FROM processings AS p
+      WHERE NOT EXISTS(
+        SELECT 1 FROM repositories AS r WHERE r.id = p.repository_id
+      )
+    SQL
+
+    # Delete process times with non-existing processings
+    execute <<-SQL
+      DELETE FROM process_times AS t
+      WHERE NOT EXISTS (
+        SELECT 1 FROM processings AS p WHERE p.id = t.processing_id
+      )
+    SQL
+
+    # Delete module results with non-existing processings
+    execute <<-SQL
+      DELETE FROM module_results AS m
+      WHERE NOT EXISTS (
+        SELECT 1 FROM processings AS p WHERE p.id = m.processing_id
+      )
+    SQL
+
+    # Delete kalibro modules with non-existing module results
+    execute <<-SQL
+      DELETE FROM kalibro_modules AS k
+      WHERE NOT EXISTS (
+        SELECT 1 FROM module_results AS m WHERE m.id = k.module_result_id
+      )
+    SQL
+
+    # Delete metric results with non-existing module results
+    execute <<-SQL
+      DELETE FROM metric_results AS met
+      WHERE NOT EXISTS (
+        SELECT 1 FROM module_results AS mod WHERE mod.id = met.module_result_id
+      )
+    SQL
+
+    # Delete duplicate tree metric results. Group them by (module_result, metric_configuration),
+    # then delete all but the one with the highest ID. Only TreeMetricResult rows are
+    # deduplicated: the unique index created afterwards is restricted to that type, so
+    # rows of other types sharing the pair are not inconsistencies and must be kept.
+    # The double wrapping on the inner query is necessary because window functions
+    # cannot be used in WHERE in PostgreSQL.
+    execute <<-SQL
+      DELETE FROM metric_results
+      WHERE id IN (
+        SELECT t.id FROM (
+          SELECT id, ROW_NUMBER() OVER (PARTITION BY module_result_id, metric_configuration_id ORDER BY id DESC) AS rnum
+          FROM metric_results
+          WHERE type = 'TreeMetricResult'
+        ) AS t
+        WHERE t.rnum > 1
+      )
+    SQL
+  end
+
+  # The deleted rows cannot be restored.
+  def self.down
+    raise ActiveRecord::IrreversibleMigration
+  end
+end
diff --git a/db/migrate/20160715192900_add_indexes_to_kalibro_modules.rb b/db/migrate/20160715192900_add_indexes_to_kalibro_modules.rb
new file mode 100644
index 0000000..718ddd4
--- /dev/null
+++ b/db/migrate/20160715192900_add_indexes_to_kalibro_modules.rb
@@ -0,0 +1,8 @@
+class AddIndexesToKalibroModules < ActiveRecord::Migration
+  def change
+    add_foreign_key :kalibro_modules, :module_results, on_delete: :cascade
+    add_index :kalibro_modules, [:long_name, :granularity]
+    # add_foreign_key does not index the referencing column; db/schema.rb expects this index
+    add_index :kalibro_modules, :module_result_id
+  end
+end
diff --git a/db/migrate/20160715192901_add_indexes_to_module_results.rb b/db/migrate/20160715192901_add_indexes_to_module_results.rb
new file mode 100644
index 0000000..2726718
--- /dev/null
+++ b/db/migrate/20160715192901_add_indexes_to_module_results.rb
@@ -0,0 +1,8 @@
+class AddIndexesToModuleResults < ActiveRecord::Migration
+  def change
+    add_foreign_key :module_results, :module_results, column: 'parent_id'
+    add_foreign_key :module_results, :processings, on_delete: :cascade
+    # add_foreign_key does not index the referencing column; db/schema.rb expects this index
+    add_index :module_results, :processing_id
+  end
+end
diff --git a/db/migrate/20160715192902_add_indexes_to_metric_results.rb
b/db/migrate/20160715192902_add_indexes_to_metric_results.rb
new file mode 100644
index 0000000..cb6bc59
--- /dev/null
+++ b/db/migrate/20160715192902_add_indexes_to_metric_results.rb
@@ -0,0 +1,11 @@
+class AddIndexesToMetricResults < ActiveRecord::Migration
+  def change
+    add_foreign_key :metric_results, :module_results, on_delete: :cascade
+    add_index :metric_results, :type
+    add_index :metric_results, :module_result_id
+    add_index :metric_results, :metric_configuration_id
+    add_index :metric_results, [:module_result_id, :metric_configuration_id],
+              unique: true, where: "type = 'TreeMetricResult'",
+              name: 'metric_results_module_res_metric_cfg_uniq_idx'
+  end
+end
diff --git a/db/migrate/20160715192903_add_indexes_to_processings.rb b/db/migrate/20160715192903_add_indexes_to_processings.rb
new file mode 100644
index 0000000..a0d1ebc
--- /dev/null
+++ b/db/migrate/20160715192903_add_indexes_to_processings.rb
@@ -0,0 +1,8 @@
+class AddIndexesToProcessings < ActiveRecord::Migration
+  def change
+    add_foreign_key :processings, :repositories
+    add_foreign_key :processings, :module_results, column: 'root_module_result_id'
+    # add_foreign_key does not index the referencing column; db/schema.rb expects this index
+    add_index :processings, :repository_id
+  end
+end
diff --git a/db/migrate/20160715192904_add_indexes_to_process_times.rb b/db/migrate/20160715192904_add_indexes_to_process_times.rb
new file mode 100644
index 0000000..1639f4b
--- /dev/null
+++ b/db/migrate/20160715192904_add_indexes_to_process_times.rb
@@ -0,0 +1,7 @@
+class AddIndexesToProcessTimes < ActiveRecord::Migration
+  def change
+    add_foreign_key :process_times, :processings, on_delete: :cascade
+    # add_foreign_key does not index the referencing column; db/schema.rb expects this index
+    add_index :process_times, :processing_id
+  end
+end
diff --git a/db/migrate/20160715192905_add_indexes_to_repositories.rb b/db/migrate/20160715192905_add_indexes_to_repositories.rb
new file mode 100644
index 0000000..dcf6b8d
--- /dev/null
+++ b/db/migrate/20160715192905_add_indexes_to_repositories.rb
@@ -0,0 +1,7 @@
+class AddIndexesToRepositories < ActiveRecord::Migration
+  def change
+    add_foreign_key :repositories, :projects
+    # add_foreign_key does not index the referencing column; db/schema.rb expects this index
+    add_index :repositories, :project_id
+  end
+end
diff --git
a/db/migrate/20160716000521_remove_timestamps_from_module_results.rb b/db/migrate/20160716000521_remove_timestamps_from_module_results.rb
new file mode 100644
index 0000000..eadb25b
--- /dev/null
+++ b/db/migrate/20160716000521_remove_timestamps_from_module_results.rb
@@ -0,0 +1,7 @@
+class RemoveTimestampsFromModuleResults < ActiveRecord::Migration
+  def change
+    # The type is only used on rollback; it must match the original datetime columns
+    remove_column :module_results, :created_at, :datetime
+    remove_column :module_results, :updated_at, :datetime
+  end
+end
diff --git a/db/migrate/20160716000528_remove_timestamps_from_metric_results.rb b/db/migrate/20160716000528_remove_timestamps_from_metric_results.rb
new file mode 100644
index 0000000..4401fa0
--- /dev/null
+++ b/db/migrate/20160716000528_remove_timestamps_from_metric_results.rb
@@ -0,0 +1,7 @@
+class RemoveTimestampsFromMetricResults < ActiveRecord::Migration
+  def change
+    # The type is only used on rollback; it must match the original datetime columns
+    remove_column :metric_results, :created_at, :datetime
+    remove_column :metric_results, :updated_at, :datetime
+  end
+end
diff --git a/db/migrate/20160716000530_remove_timestamps_from_kalibro_modules.rb b/db/migrate/20160716000530_remove_timestamps_from_kalibro_modules.rb
new file mode 100644
index 0000000..e7d2abe
--- /dev/null
+++ b/db/migrate/20160716000530_remove_timestamps_from_kalibro_modules.rb
@@ -0,0 +1,7 @@
+class RemoveTimestampsFromKalibroModules < ActiveRecord::Migration
+  def change
+    # The type is only used on rollback; it must match the original datetime columns
+    remove_column :kalibro_modules, :created_at, :datetime
+    remove_column :kalibro_modules, :updated_at, :datetime
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index f576490..d399861 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -11,7 +11,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema.define(version: 20151002172231) do +ActiveRecord::Schema.define(version: 20160716015915) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -33,36 +33,38 @@ add_index "delayed_jobs", ["priority", "run_at"], name: "delayed_jobs_priority", using: :btree create_table "kalibro_modules", force: :cascade do |t| - t.string "long_name", limit: 255 - t.string "granularity", limit: 255 - t.datetime "created_at" - t.datetime "updated_at" - t.integer "module_result_id" + t.string "long_name", limit: 255 + t.string "granularity", limit: 255 + t.integer "module_result_id" end + add_index "kalibro_modules", ["long_name", "granularity"], name: "index_kalibro_modules_on_long_name_and_granularity", using: :btree + add_index "kalibro_modules", ["module_result_id"], name: "index_kalibro_modules_on_module_result_id", using: :btree + create_table "metric_results", force: :cascade do |t| - t.integer "module_result_id" - t.integer "metric_configuration_id" - t.float "value" - t.datetime "created_at" - t.datetime "updated_at" - t.string "type", default: "MetricResult", null: false - t.integer "line_number" - t.text "message" - t.integer "related_hotspot_metric_results_id" + t.integer "module_result_id" + t.integer "metric_configuration_id" + t.float "value" + t.string "type", default: "MetricResult", null: false + t.integer "line_number" + t.text "message" + t.integer "related_hotspot_metric_results_id" end + add_index "metric_results", ["metric_configuration_id"], name: "index_metric_results_on_metric_configuration_id", using: :btree + add_index "metric_results", ["module_result_id", "metric_configuration_id"], name: "metric_results_module_res_metric_cfg_uniq_idx", unique: true, where: "((type)::text = 'TreeMetricResult'::text)", using: :btree + add_index "metric_results", ["module_result_id"], name: "index_metric_results_on_module_result_id", using: :btree add_index "metric_results", 
["related_hotspot_metric_results_id"], name: "index_metric_results_on_related_hotspot_metric_results_id", using: :btree + add_index "metric_results", ["type"], name: "index_metric_results_on_type", using: :btree create_table "module_results", force: :cascade do |t| - t.float "grade" - t.integer "parent_id" - t.datetime "created_at" - t.datetime "updated_at" - t.integer "processing_id" + t.float "grade" + t.integer "parent_id" + t.integer "processing_id" end add_index "module_results", ["parent_id"], name: "index_module_results_on_parent_id", using: :btree + add_index "module_results", ["processing_id"], name: "index_module_results_on_processing_id", using: :btree create_table "process_times", force: :cascade do |t| t.string "state", limit: 255 @@ -72,6 +74,8 @@ t.float "time" end + add_index "process_times", ["processing_id"], name: "index_process_times_on_processing_id", using: :btree + create_table "processings", force: :cascade do |t| t.string "state", limit: 255 t.integer "repository_id" @@ -81,6 +85,8 @@ t.text "error_message" end + add_index "processings", ["repository_id"], name: "index_processings_on_repository_id", using: :btree + create_table "projects", force: :cascade do |t| t.string "name", limit: 255 t.string "description", limit: 255 @@ -106,5 +112,15 @@ t.string "branch", default: "master", null: false end + add_index "repositories", ["project_id"], name: "index_repositories_on_project_id", using: :btree + + add_foreign_key "kalibro_modules", "module_results", on_delete: :cascade + add_foreign_key "metric_results", "module_results", on_delete: :cascade add_foreign_key "metric_results", "related_hotspot_metric_results", column: "related_hotspot_metric_results_id" + add_foreign_key "module_results", "module_results", column: "parent_id" + add_foreign_key "module_results", "processings", on_delete: :cascade + add_foreign_key "process_times", "processings", on_delete: :cascade + add_foreign_key "processings", "module_results", column: 
"root_module_result_id" + add_foreign_key "processings", "repositories" + add_foreign_key "repositories", "projects" end