[GLUTEN-6368] Redact sensitive configs when calling gluten::printConfig
#6793
@@ -16,12 +16,28 @@
  */

 #include <jni.h>

+#include <optional>
+#include <regex>
 #include "compute/ProtobufUtils.h"
 #include "config.pb.h"
 #include "jni/JniError.h"

+namespace {
+
+const std::string REGEX_REDACT_KEY = "spark.gluten.redaction.regex";
+std::optional<std::regex> getRedactionRegex(const std::unordered_map<std::string, std::string>& conf) {
+  auto it = conf.find(REGEX_REDACT_KEY);
+  if (it != conf.end()) {
+    return std::regex(it->second);
+  }
+  return std::nullopt;
+}
+} // namespace
+
 namespace gluten {

+const std::string REDACTED_VALUE = "*********(redacted)";
Review comment (on REDACTED_VALUE): ditto.
 std::unordered_map<std::string, std::string>
 parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength) {
   std::unordered_map<std::string, std::string> sparkConfs;

@@ -37,9 +53,17 @@ parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength)
 std::string printConfig(const std::unordered_map<std::string, std::string>& conf) {
   std::ostringstream oss;
   oss << std::endl;
-  for (auto& [k, v] : conf) {
-    oss << " [" << k << ", " << v << "]\n";
+
+  auto redactionRegex = getRedactionRegex(conf);
+
+  for (const auto& [k, v] : conf) {
+    if (redactionRegex && std::regex_match(k, *redactionRegex)) {
+      oss << " [" << k << ", " << REDACTED_VALUE << "]\n";
+    } else {
+      oss << " [" << k << ", " << v << "]\n";
+    }
   }
   return oss.str();
 }

 } // namespace gluten
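For illustration, a minimal sketch of how printConfig behaves once a redaction pattern is present in the conf map. The key names and the pattern below are made-up examples, and gluten::printConfig is declared locally only so the snippet is self-contained; a real caller would include the corresponding Gluten header instead.

#include <iostream>
#include <string>
#include <unordered_map>

// Local declaration matching the function shown in the diff above; in a real
// build this would come from the corresponding Gluten header.
namespace gluten {
std::string printConfig(const std::unordered_map<std::string, std::string>& conf);
}

int main() {
  std::unordered_map<std::string, std::string> conf = {
      {"spark.gluten.redaction.regex", ".*secret.*"}, // example pattern; regex_match requires a full match
      {"spark.my.secret.key", "abc123"}, // made-up sensitive entry
      {"spark.app.name", "demo"}};
  // Expected lines (unordered_map iteration order is unspecified):
  //  [spark.my.secret.key, *********(redacted)]
  //  [spark.app.name, demo]
  //  [spark.gluten.redaction.regex, .*secret.*]
  std::cout << gluten::printConfig(conf);
  return 0;
}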
@@ -622,6 +622,7 @@ object GlutenConfig {

   val GLUTEN_COST_EVALUATOR_ENABLED = "spark.gluten.sql.adaptive.costEvaluator.enabled"

+  val GLUTEN_REGEX_LOG_REDACTION = "spark.gluten.redaction.regex"
   var ins: GlutenConfig = _

   def getConf: GlutenConfig = {
@@ -673,7 +674,8 @@ object GlutenConfig {
       // gcs config
       SPARK_GCS_STORAGE_ROOT_URL,
       SPARK_GCS_AUTH_TYPE,
-      SPARK_GCS_AUTH_SERVICE_ACCOUNT_JSON_KEYFILE
+      SPARK_GCS_AUTH_SERVICE_ACCOUNT_JSON_KEYFILE,
+      GLUTEN_REGEX_LOG_REDACTION
Review thread on GLUTEN_REGEX_LOG_REDACTION:
- @ArnavBalyan, I think we should just use Spark's existing redaction config key, so that the Spark config is respected in Gluten. See, for example, how Spark's session timezone is passed through.
- Thanks, I was trying to do this, but then ran into some compatibility problems with C++.
- @ArnavBalyan, can we use boost::regex instead?
- Yes, we can use boost; there may still be parity differences, but they would be less common. I also just noticed that the regex redaction property is present in Spark.
- @ArnavBalyan, yes, we cannot guarantee compatibility for every regex pattern, but at least Spark's default regex should be supported in Gluten.
- Yes, there are differences in the C++ regex library; we could probably fall back to something sensible as a compromise.
- Thanks for the review @PHILO-HE @xushichangdesmond. I'll wire up the Spark core config and use it by default; we can improve this later to fall back to Gluten-specific configs if needed.
- Could we overwrite Spark's default pattern with a valid regex pattern in native code?
- There are a lot of incompatibilities; it's one of those areas where C++ and Java don't have complete parity. I would rather we either provide limited support for the same config, or let users provide a regex supported by C++ via a different property. Option 1 is better for now, and we can add more support later.
     )
     nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava)
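A minimal sketch of the fallback compromise discussed in the thread above, assuming the configured pattern may use Java-only regex constructs (for example an inline (?i) flag) that std::regex's ECMAScript grammar rejects. The helper name, the fallback pattern, and the error handling are illustrative, not part of this PR.

#include <optional>
#include <regex>
#include <string>
#include <unordered_map>

namespace {

// Illustrative fallback covering common sensitive key names; applied
// case-insensitively via std::regex::icase rather than an inline (?i) flag,
// which std::regex (ECMAScript grammar) does not accept.
const std::string kFallbackRedactionPattern = ".*(secret|password|token).*";

std::optional<std::regex> compileRedactionRegex(
    const std::unordered_map<std::string, std::string>& conf,
    const std::string& key) {
  auto it = conf.find(key);
  if (it == conf.end()) {
    return std::nullopt;
  }
  try {
    // Use the configured (Spark-side) pattern when std::regex can compile it.
    return std::regex(it->second);
  } catch (const std::regex_error&) {
    // Otherwise fall back to a conservative pattern instead of silently
    // disabling redaction.
    return std::regex(kFallbackRedactionPattern, std::regex::icase);
  }
}

} // namespace

The point of the fallback is that a pattern which fails to compile still leaves some redaction in place rather than printing every value in the clear.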
@@ -757,7 +759,8 @@ object GlutenConfig {
       GLUTEN_TASK_OFFHEAP_SIZE_IN_BYTES_KEY,
       GLUTEN_OFFHEAP_ENABLED,
       SESSION_LOCAL_TIMEZONE.key,
-      DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key
+      DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key,
+      GLUTEN_REGEX_LOG_REDACTION
     )
     nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava)
Review thread:
- Move the config key definition to GlutenConfig.h.
- And follow the naming of the other config keys.
- This will be changed; we will no longer rely on this config.
- All configs from GlutenConfig.scala should be mapped to GlutenConfig.h/VeloxConfig.h; let's keep the configs consistent.
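For reference, a rough sketch of what the suggested native-side definition might look like, assuming the thread's decision to reuse Spark's own spark.redaction.regex key. The k-prefixed constant name and the header placement are guesses at the existing convention, not taken from this PR.

#include <string>

// Hypothetical addition to GlutenConfig.h, mirroring the Spark-side key so the
// Scala and native config definitions stay consistent.
const std::string kSparkRedactionRegex = "spark.redaction.regex";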