-
Notifications
You must be signed in to change notification settings - Fork 170
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Send gRPC errors properly #1983
Changes from 5 commits
2afd8c7
c7ea3c0
a8a79ea
0e72e46
6dd1b17
cd62f5f
4128a71
5537eae
7467697
4ddb2ab
1ddedde
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,8 +2,8 @@ package misk.grpc | |
|
||
import com.google.inject.Guice | ||
import com.google.inject.util.Modules | ||
import javax.inject.Inject | ||
import javax.inject.Named | ||
import com.squareup.wire.GrpcException | ||
import com.squareup.wire.GrpcStatus | ||
import kotlinx.coroutines.GlobalScope | ||
import kotlinx.coroutines.channels.ReceiveChannel | ||
import kotlinx.coroutines.runBlocking | ||
|
@@ -13,6 +13,7 @@ import misk.grpc.miskserver.RouteChatGrpcAction | |
import misk.grpc.miskserver.RouteGuideMiskServiceModule | ||
import misk.inject.getInstance | ||
import misk.logging.LogCollectorModule | ||
import misk.metrics.FakeMetrics | ||
import misk.testing.MiskTest | ||
import misk.testing.MiskTestModule | ||
import misk.web.interceptors.RequestLoggingInterceptor | ||
|
@@ -25,6 +26,9 @@ import routeguide.Point | |
import routeguide.RouteGuideClient | ||
import routeguide.RouteNote | ||
import wisp.logging.LogCollector | ||
import javax.inject.Inject | ||
import javax.inject.Named | ||
import kotlin.test.assertFailsWith | ||
|
||
@MiskTest(startService = true) | ||
class MiskClientMiskServerTest { | ||
|
@@ -37,6 +41,7 @@ class MiskClientMiskServerTest { | |
@Inject lateinit var logCollector: LogCollector | ||
@Inject lateinit var routeChatGrpcAction: RouteChatGrpcAction | ||
@Inject @field:Named("grpc server") lateinit var serverUrl: HttpUrl | ||
@Inject lateinit var metrics: FakeMetrics | ||
|
||
private lateinit var routeGuide: RouteGuideClient | ||
private lateinit var callCounter: RouteGuideCallCounter | ||
|
@@ -105,4 +110,61 @@ class MiskClientMiskServerTest { | |
sendChannel.close() | ||
} | ||
} | ||
|
||
@Test | ||
fun serverFailureGeneric() { | ||
val point = Point( | ||
latitude = -1, | ||
longitude = 500 | ||
) | ||
|
||
runBlocking { | ||
val e = assertFailsWith<GrpcException> { | ||
routeGuide.GetFeature().execute(point) | ||
} | ||
assertThat(e.grpcMessage).isEqualTo("unexpected latitude error!") | ||
assertThat(e.grpcStatus).isEqualTo(GrpcStatus.UNKNOWN) | ||
|
||
// Assert that _metrics_ counted a 500 and no 200s, even though an HTTP 200 was returned | ||
// over HTTP. The 200 is implicitly asserted by the fact that we got a GrpcException, which | ||
// is only thrown if a properly constructed gRPC error is received. | ||
assertResponseCount(200, 0) | ||
assertResponseCount(500, 1) | ||
Comment on lines
+139
to
+140
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test is broken; will approve when it passes. Also, nit: add code comment about why these are the expected metrics There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you be more specific about what you mean by "broken" here? The assertion is about what's counted in metrics, which we very much do want to be what's asserted here. See the discussion in #1983 (comment) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah I saw a test failure here, but it looks like it's working now. |
||
} | ||
} | ||
|
||
@Test | ||
fun serverFailureNotFound() { | ||
val point = Point( | ||
latitude = -1, | ||
longitude = 404 | ||
) | ||
|
||
runBlocking { | ||
val e = assertFailsWith<GrpcException> { | ||
routeGuide.GetFeature().execute(point) | ||
} | ||
assertThat(e.grpcMessage).isEqualTo("unexpected latitude error!") | ||
assertThat(e.grpcStatus).isEqualTo(GrpcStatus.UNIMPLEMENTED) | ||
.withFailMessage("wrong gRPC status ${e.grpcStatus.name}") | ||
|
||
// Assert that _metrics_ counted a 404 and no 200s, even though an HTTP 200 was returned | ||
// over HTTP. The 200 is implicitly asserted by the fact that we got a GrpcException, which | ||
// is only thrown if a properly constructed gRPC error is received. | ||
assertResponseCount(200, 0) | ||
assertResponseCount(404, 1) | ||
} | ||
} | ||
|
||
private fun assertResponseCount(code: Int, count: Int) { | ||
val responseCount = metrics.histogramCount( | ||
"http_request_latency_ms", | ||
"action" to "GetFeatureGrpcAction", | ||
"caller" to "unknown", | ||
"code" to code.toString(), | ||
)?.toInt() ?: 0 | ||
assertThat(responseCount) | ||
.withFailMessage("Expected metrics to indicate $count responses with HTTP status $code but got $responseCount") | ||
.isEqualTo(count) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,25 @@ | ||
package misk.web.exceptions | ||
|
||
import com.google.common.util.concurrent.UncheckedExecutionException | ||
import com.squareup.wire.GrpcStatus | ||
import com.squareup.wire.ProtoAdapter | ||
import misk.Action | ||
import misk.exceptions.StatusCode | ||
import misk.exceptions.UnauthenticatedException | ||
import misk.exceptions.UnauthorizedException | ||
import misk.grpc.GrpcMessageSink | ||
import misk.web.DispatchMechanism | ||
import misk.web.HttpCall | ||
import misk.web.NetworkChain | ||
import misk.web.NetworkInterceptor | ||
import misk.web.Response | ||
import misk.web.ResponseBody | ||
import misk.web.mediatype.MediaTypes | ||
import misk.web.toResponseBody | ||
import okhttp3.Headers.Companion.toHeaders | ||
import okio.Buffer | ||
import okio.BufferedSink | ||
import okio.ByteString | ||
import wisp.logging.getLogger | ||
import wisp.logging.log | ||
import java.lang.reflect.InvocationTargetException | ||
|
@@ -37,13 +45,67 @@ class ExceptionHandlingInterceptor( | |
} catch (th: Throwable) { | ||
val response = toResponse(th) | ||
chain.httpCall.statusCode = response.statusCode | ||
chain.httpCall.takeResponseBody()?.use { sink -> | ||
chain.httpCall.addResponseHeaders(response.headers) | ||
(response.body as ResponseBody).writeTo(sink) | ||
if (chain.httpCall.dispatchMechanism == DispatchMechanism.GRPC) { | ||
sendGrpcFailure(chain.httpCall, response) | ||
} else { | ||
sendHttpFailure(chain.httpCall, response) | ||
} | ||
} | ||
} | ||
|
||
private fun sendHttpFailure(httpCall: HttpCall, response: Response<*>) { | ||
httpCall.takeResponseBody()?.use { sink -> | ||
httpCall.addResponseHeaders(response.headers) | ||
(response.body as ResponseBody).writeTo(sink) | ||
} | ||
} | ||
|
||
/** | ||
* Borrow behavior from [GrpcFeatureBinding] to send a gRPC error with an HTTP 200 status code. | ||
* This is weird but it's how gRPC clients work. | ||
* | ||
* One thing to note is for our metrics we want to pretend that the HTTP code is what we sent. | ||
* Otherwise gRPC requests that crashed and yielded an HTTP 200 code will confuse operators. | ||
*/ | ||
Comment on lines
+63
to
+69
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see that this will reduce disruption with metrics, but I feel like having two sets of HTTP status codes is confusing in the long run, and it adds a layer of indirection with gRPC statuses. Why not have a metric for different gRPC statuses instead? To provide a good migration path, we can probably emit both HTTP and gRPC status code metrics, then turn off HTTP status code metrics for gRPC endpoints later. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggest new metrics that look like these, but no real need to make them identical I think? https://github.com/grpc-ecosystem/java-grpc-prometheus There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (fwiw, this is fine to keep as is in this PR) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. gRPC is always 200 OK, even when it’s not okay, and that is awful and a guarantee that there will be confusion. My preference is that we try to hide the insincere 200 OKs in implementation details. Metrics and alerting and even just developer expectations are pretty invested in 200/400/500 meaning different things. The 200 is a lie, and we want to quietly send it over the Wire without spreading that lie to other systems. I think trying to get all those downstream tools to have independent HTTP and gRPC modes only spreads the confusion. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I get that gRPC errors being HTTP 200 is confusing. Thinking about it more, my concerns are a) HTTP 4xx/5xx and gRPC have a non-1:1 mapping with different semantics, and b) hiding the HTTP and gRPC status makes it a bit more difficult to trace requests when debugging (e.g. the server said 502 in its logs, but the client got a 200). I see what you mean with metrics though, and am ok with having a mapping for metrics purposes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree we should emit some gRPC-specific metrics to improve observability, but I think we can do that as a follow-up PR (#1992). This PR strictly improves the situation from "Misk counts 4xx/5xx metrics but breaks gRPC clients" to "Misk counts 4xx/5xx metrics and doesn't break gRPC clients". The third option, of "Misk fails to count 4xx/5xx metrics for gRPC but clients get the right error responses" seems strictly worse than either of those to me. |
||
private fun sendGrpcFailure(httpCall: HttpCall, response: Response<*>) { | ||
httpCall.networkStatusCode = 200 | ||
httpCall.requireTrailers() | ||
httpCall.setResponseHeader("grpc-encoding", "identity") | ||
httpCall.setResponseHeader("Content-Type", MediaTypes.APPLICATION_GRPC) | ||
httpCall.setResponseTrailer( | ||
"grpc-status", | ||
toGrpcStatus(response.statusCode).code.toString() | ||
) | ||
httpCall.setResponseTrailer("grpc-message", this.grpcMessage(response)) | ||
httpCall.takeResponseBody()?.use { responseBody: BufferedSink -> | ||
GrpcMessageSink(responseBody, ProtoAdapter.BYTES, grpcEncoding = "identity") | ||
.use { messageSink -> | ||
messageSink.write(ByteString.EMPTY) | ||
} | ||
} | ||
} | ||
|
||
private fun grpcMessage(response: Response<*>): String { | ||
val buffer = Buffer() | ||
(response.body as ResponseBody).writeTo(buffer) | ||
return buffer.readUtf8() | ||
} | ||
|
||
/** https://grpc.github.io/grpc/core/md_doc_http-grpc-status-mapping.html */ | ||
private fun toGrpcStatus(statusCode: Int): GrpcStatus { | ||
return when (statusCode) { | ||
400 -> GrpcStatus.INTERNAL | ||
401 -> GrpcStatus.UNAUTHENTICATED | ||
403 -> GrpcStatus.PERMISSION_DENIED | ||
404 -> GrpcStatus.UNIMPLEMENTED | ||
429 -> GrpcStatus.UNAVAILABLE | ||
502 -> GrpcStatus.UNAVAILABLE | ||
503 -> GrpcStatus.UNAVAILABLE | ||
504 -> GrpcStatus.UNAVAILABLE | ||
else -> GrpcStatus.UNKNOWN | ||
} | ||
} | ||
|
||
private fun toResponse(th: Throwable): Response<*> = when (th) { | ||
is UnauthenticatedException -> UNAUTHENTICATED_RESPONSE | ||
is UnauthorizedException -> UNAUTHORIZED_RESPONSE | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This feels so hacky lol. How about a separate RPC that always fails?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm pretty sure that the
RouteGuide
service is borrowed from the gRPC reference implementation, so I'm loathe to extend that with another method just for this sort of test. Given that it's strictly test code, I'm not sure I agree that it's worth doing in a perfect way.@swankjesse what do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Happy as is.