diff --git a/src/main/org/deidentifier/arx/ARXConfiguration.java b/src/main/org/deidentifier/arx/ARXConfiguration.java
index b464ee88f4..79ea83a7c7 100644
--- a/src/main/org/deidentifier/arx/ARXConfiguration.java
+++ b/src/main/org/deidentifier/arx/ARXConfiguration.java
@@ -1222,7 +1222,9 @@ protected ARXConfiguration getInstanceForLocalRecoding(RowSet rowset, double gsF
         result.aCriteria = null;
         result.criteria = criteria;
         MetricConfiguration utilityConfig = result.getQualityModel().getConfiguration();
-        utilityConfig.setGsFactor(gsFactor);
+        if (!Double.isNaN(gsFactor)) {
+            utilityConfig.setGsFactor(gsFactor);
+        }
         result.metric = result.getQualityModel().getDescription().createInstance(utilityConfig);
         
         // Return
diff --git a/src/main/org/deidentifier/arx/ARXResult.java b/src/main/org/deidentifier/arx/ARXResult.java
index 3185403d5f..e672da56ce 100644
--- a/src/main/org/deidentifier/arx/ARXResult.java
+++ b/src/main/org/deidentifier/arx/ARXResult.java
@@ -489,10 +489,57 @@ public void progress(double progress) {
             }
         });
     }
+    
+    /**
+     * This method optimizes the given data output with local recoding to improve its utility
+     * @param handle
+     * @param gsFactor A factor [0,1] weighting generalization and suppression.
+     *                 The default value is 0.5, which means that generalization
+     *                 and suppression will be treated equally. A factor of 0
+     *                 will favor suppression, and a factor of 1 will favor
+     *                 generalization. The values in between can be used for
+     *                 balancing both methods.
+     * @param listener
+     * @return The number of optimized records
+     */
+    public int optimize(DataHandle handle, double gsFactor, ARXListener listener) throws RollbackRequiredException {
+        return optimizeFast(handle, Double.NaN, gsFactor, listener);
+    }
     
     /**
      * This method optimizes the given data output with local recoding to improve its utility
      * @param handle
+     * @param records A fraction [0,1] of records that need to be optimized.
+     * @return The number of optimized records
+     */
+    public int optimizeFast(DataHandle handle,
+                            double records) throws RollbackRequiredException {
+        return optimizeFast(handle, records, Double.NaN, new ARXListener(){
+            @Override
+            public void progress(double progress) {
+                // Empty by design
+            }
+        });
+    }
+    
+    /**
+     * This method optimizes the given data output with local recoding to improve its utility
+     * @param handle
+     * @param records A fraction [0,1] of records that need to be optimized.
+     * @param listener
+     * @return The number of optimized records
+     */
+    public int optimizeFast(DataHandle handle,
+                            double records,
+                            ARXListener listener) throws RollbackRequiredException {
+        return optimizeFast(handle, records, Double.NaN, listener);
+    }
+    
+    /**
+     * This method optimizes the given data output with local recoding to improve its utility
+     * @param handle
+     * @param records A fraction [0,1] of records that need to be optimized.
      * @param gsFactor A factor [0,1] weighting generalization and suppression.
      *                 The default value is 0.5, which means that generalization
      *                 and suppression will be treated equally. A factor of 0
@@ -502,7 +549,10 @@ public void progress(double progress) {
      * @param listener
      * @return The number of optimized records
      */
-    public int optimize(DataHandle handle, double gsFactor, ARXListener listener) throws RollbackRequiredException {
+    public int optimizeFast(DataHandle handle,
+                            double records,
+                            double gsFactor,
+                            ARXListener listener) throws RollbackRequiredException {
         
         // Check if null
         if (listener == null) {
@@ -515,7 +565,12 @@ public int optimize(DataHandle handle, double gsFactor, ARXListener listener) th
         }
         
         // Check bounds
-        if (gsFactor < 0d || gsFactor > 1d) {
+        if (!Double.isNaN(records) && (records < 0d || records > 1d)) {
+            throw new IllegalArgumentException("Number of records to optimize must be in [0, 1]");
+        }
+        
+        // Check bounds
+        if (!Double.isNaN(gsFactor) && (gsFactor < 0d || gsFactor > 1d)) {
             throw new IllegalArgumentException("Generalization/suppression factor must be in [0, 1]");
         }
         
@@ -555,7 +610,14 @@ public int optimize(DataHandle handle, double gsFactor, ARXListener listener) th
         // - Subsets will be projected accordingly
         // - Utility measures will be cloned
         ARXConfiguration config = this.config.getInstanceForLocalRecoding(rowset, gsFactor);
-        
+        if (!Double.isNaN(records)) {
+            double absoluteRecords = records * handle.getNumRows();
+            double relativeRecords = absoluteRecords / (double)rowset.size();
+            relativeRecords = relativeRecords < 0d ? 0d : relativeRecords;
+            relativeRecords = relativeRecords > 1d ? 1d : relativeRecords;
+            config.setMaxOutliers(1d - relativeRecords);
+        }
+        
         // In the data definition, only microaggregation functions maintain a state, but these
         // are cloned, when cloning the definition
         // TODO: This is probably not necessary, because they are used from the data manager,
@@ -641,7 +703,6 @@ public int optimize(DataHandle handle, double gsFactor, ARXListener listener) th
      *                 balancing both methods.
      * @param maxIterations The maximal number of iterations to perform
      * @param adaptionFactor Is added to the gsFactor when reaching a fixpoint
-     * @param listener
      * @throws RollbackRequiredException
      */
     public void optimizeIterative(DataHandle handle,
@@ -686,13 +747,19 @@ public void optimizeIterative(final DataHandle handle,
             throw new IllegalArgumentException("Max. iterations must be > zero");
         }
         
+        // Prepare
+        int iterationsTotal = 0;
+        int optimizedCurrent = Integer.MAX_VALUE;
+        int optimizedTotal = 0;
+        int optimizedGoal = 0;
+        for (int row = 0; row < handle.getNumRows(); row++) {
+            optimizedGoal += handle.isOutlier(row) ? 1 : 0;
+        }
+        
         // Progress
         listener.progress(0d);
-        
+
         // Outer loop
-        int optimizedTotal = 0;
-        int iterationsTotal = 0;
-        int optimizedCurrent = Integer.MAX_VALUE;
         while (isOptimizable(handle) && iterationsTotal < maxIterations && optimizedCurrent > 0) {
             
             // Perform individual optimization
@@ -711,7 +778,7 @@ public void optimizeIterative(final DataHandle handle,
             iterationsTotal++;
             
             // Progress
-            double progress1 = (double)optimizedTotal / (double)handle.getNumRows();
+            double progress1 = (double)optimizedTotal / (double)optimizedGoal;
             double progress2 = (double)iterationsTotal / (double)maxIterations;
             listener.progress(Math.max(progress1, progress2));
         }
@@ -719,6 +786,95 @@ public void optimizeIterative(final DataHandle handle,
         // Progress
         listener.progress(1d);
     }
+    
+    /**
+     * This method optimizes the given data output with local recoding to improve its utility
+     * @param handle
+     * @param records A fraction [0,1] of records that need to be optimized in each step.
+     * @throws RollbackRequiredException
+     */
+    public void optimizeIterativeFast(DataHandle handle,
+                                      double records) throws RollbackRequiredException {
+        this.optimizeIterativeFast(handle, records, Double.NaN, new ARXListener(){
+            @Override
+            public void progress(double progress) {
+                // Empty by design
+            }
+        });
+    }
+    
+    /**
+     * This method optimizes the given data output with local recoding to improve its utility
+     * @param handle
+     * @param records A fraction [0,1] of records that need to be optimized in each step.
+     * @param listener
+     * @throws RollbackRequiredException
+     */
+    public void optimizeIterativeFast(DataHandle handle,
+                                      double records,
+                                      ARXListener listener) throws RollbackRequiredException {
+        this.optimizeIterativeFast(handle, records, Double.NaN, listener);
+    }
+    
+    /**
+     * This method optimizes the given data output with local recoding to improve its utility
+     * @param handle
+     * @param records A fraction [0,1] of records that need to be optimized in each step.
+     * @param gsFactor A factor [0,1] weighting generalization and suppression.
+     *                 The default value is 0.5, which means that generalization
+     *                 and suppression will be treated equally. A factor of 0
+     *                 will favor suppression, and a factor of 1 will favor
+     *                 generalization. The values in between can be used for
+     *                 balancing both methods.
+     * @param listener
+     * @throws RollbackRequiredException
+     */
+    public void optimizeIterativeFast(final DataHandle handle,
+                                      double records,
+                                      double gsFactor,
+                                      final ARXListener listener) throws RollbackRequiredException {
+        
+        if (!Double.isNaN(gsFactor) && (gsFactor < 0d || gsFactor > 1d)) {
+            throw new IllegalArgumentException("Generalization/suppression factor must be in [0, 1]");
+        }
+        if (records < 0d || records > 1d) {
+            throw new IllegalArgumentException("Number of records to optimize must be in [0, 1]");
+        }
+        
+        // Prepare
+        int optimizedCurrent = Integer.MAX_VALUE;
+        int optimizedTotal = 0;
+        int optimizedGoal = 0;
+        for (int row = 0; row < handle.getNumRows(); row++) {
+            optimizedGoal += handle.isOutlier(row) ? 1 : 0;
+        }
+        
+        // Progress
+        listener.progress(0d);
+        
+        // Outer loop
+        while (isOptimizable(handle) && optimizedCurrent > 0) {
+            
+            // Progress
+            final double minProgress = (double)optimizedTotal / (double)optimizedGoal;
+            final double maxProgress = minProgress + records;
+            
+            // Perform individual optimization
+            optimizedCurrent = optimizeFast(handle, records, gsFactor, new ARXListener() {
+                @Override
+                public void progress(double progress) {
+                    listener.progress(minProgress + progress * (maxProgress - minProgress));
+                }
+            });
+            optimizedTotal += optimizedCurrent;
+            
+            // Progress
+            listener.progress((double)optimizedTotal / (double)optimizedGoal);
+        }
+        
+        // Progress
+        listener.progress(1d);
+    }
     
     /**
      * Returns a map of all microaggregation functions
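
To illustrate the suppression-limit computation added to optimizeFast, with assumed numbers only: for an output handle of 1,000 records, records = 0.1 and a rowset of 400 rows selected for local recoding, absoluteRecords = 0.1 * 1000 = 100 and relativeRecords = 100 / 400 = 0.25 (clamped to [0, 1]), so the derived configuration receives setMaxOutliers(1 - 0.25) = 0.75, i.e. roughly a quarter of the selected rows have to be transformed in that pass. The figures are illustrative, not taken from a test.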
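
For reviewers, a short usage sketch of the methods introduced above. It is illustrative only: the method name optimizeOutput, the variable names, and the fractions 0.1, 0.7 and 0.05 are assumptions, and the ARXResult is expected to come from the usual ARXAnonymizer.anonymize(...) workflow.

    // Illustrative usage of the new local-recoding API (not part of this change)
    void optimizeOutput(ARXResult result) throws RollbackRequiredException {

        // Handle representing the anonymized output
        DataHandle output = result.getOutput();

        // One-shot: optimize about 10% of the records, slightly favoring
        // generalization over suppression (gsFactor = 0.7), with progress reporting
        int optimized = result.optimizeFast(output, 0.1d, 0.7d, new ARXListener() {
            @Override
            public void progress(double progress) {
                System.out.println("Progress: " + progress);
            }
        });
        System.out.println("Optimized records: " + optimized);

        // Iterative: optimize about 5% of the records per step until no further
        // improvement is possible; the quality model's gsFactor is left unchanged (NaN)
        result.optimizeIterativeFast(output, 0.05d);
    }

The sketch exercises both entry points: the single-pass optimizeFast with an explicit gsFactor, and the convenience overload optimizeIterativeFast(handle, records), which keeps the configured quality model untouched by passing Double.NaN internally.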