From 9c2344840e7af1bcb6411bc3e2e29b2b0e55e776 Mon Sep 17 00:00:00 2001 From: cloud-profiler-team Date: Tue, 3 Jan 2023 19:40:44 -0800 Subject: [PATCH] fix: update code for breaking changes in Python 3.11 Fixes #127. Struct members were hidden from PyFrameObject and PyThreadState. See https://docs.python.org/3/whatsnew/3.11.html#pyframeobject-3-11-hiding. This PR is passing public and internal tests. However, I am not particularly confident that this fix is safe/correct. The handler operates possibly without the GIL, as it does not currently modify any of the PyObjects it works with. It gets the thread local ThreadState with `PyGILState_GetThisThreadState()`. - `PyCodeObject`s may be GC'ed while the handler is running, but we get around that by patching the PyCode deallocator function [`PyCode_Type.tp_dealloc = &CodeDealloc`](https://github.com/GoogleCloudPlatform/cloud-profiler-python/blob/ae0c31b7ed962c8158e8a24ba44be6442c7374f8/googlecloudprofiler/src/profiler.h#L60) The problem is that the getters described in [the changelog](https://docs.python.org/3/whatsnew/3.11.html#pyframeobject-3-11-hiding) (`PyThreadState_GetFrame()`, `PyFrame_GetCode()`, `PyFrame_GetBack()`) increment the refcount for those PyObjects which afaict is NOT safe to do without holding the GIL. Trying to acquire the GIL in the handler seems to deadlock some of the internal integration tests; I'm also not confident that this would be [async-signal-safe](https://man7.org/linux/man-pages/man7/signal-safety.7.html). Folks are welcome to try the profiler in Python 3.11 from this PR but I am not comfortable merging it as is. If you do try it, please report back on if you encountered any issues. PiperOrigin-RevId: 499370977 --- googlecloudprofiler/src/profiler.cc | 42 ++++++++++++++++++++++++++--- kokoro/integration_test.go | 2 +- kokoro/integration_test.sh | 2 +- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/googlecloudprofiler/src/profiler.cc b/googlecloudprofiler/src/profiler.cc index 08d946c..f8e1d51 100644 --- a/googlecloudprofiler/src/profiler.cc +++ b/googlecloudprofiler/src/profiler.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include "clock.h" #include "log.h" @@ -42,7 +43,14 @@ struct PyObjectDecReffer { } }; +struct PyFrameObjectDecReffer { + void operator()(PyFrameObject *py_frame_object) const { + Py_XDECREF(py_frame_object); + } +}; + typedef std::unique_ptr PyObjectRef; +typedef std::unique_ptr PyFrameObjectRef; // Helper class to store and reset errno when in a signal handler. class ErrnoRaii { @@ -58,6 +66,23 @@ class ErrnoRaii { int stored_errno_; }; +// Code defining a few frame related methods on Python 3.8 and older, copied +// from https://docs.python.org/3/whatsnew/3.11.html#pyframeobject-3-11-hiding +#if PY_VERSION_HEX < 0x030900B1 +static inline PyFrameObject *PyThreadState_GetFrame(PyThreadState *tstate) { + Py_XINCREF(tstate->frame); + return tstate->frame; +} +static inline PyCodeObject *PyFrame_GetCode(PyFrameObject *frame) { + Py_INCREF(frame->f_code); + return frame->f_code; +} +static inline PyFrameObject *PyFrame_GetBack(PyFrameObject *frame) { + Py_XINCREF(frame->f_back); + return frame->f_back; +} +#endif + } // namespace destructor CodeDeallocHook::old_code_dealloc_ = nullptr; @@ -157,13 +182,22 @@ void Profiler::Handle(int signum, siginfo_t *info, void *context) { } else { // We are running in the context of the thread interrupted by the signal // so the frame object for the current thread is stable. - PyFrameObject *frame = ts->frame; + PyFrameObjectRef frame(PyThreadState_GetFrame(ts)); + int num_frames = 0; while (frame != nullptr && num_frames < kMaxFramesToCapture) { - frames[num_frames].lineno = frame->f_lineno; - frames[num_frames].py_code = frame->f_code; + frames[num_frames].lineno = PyFrame_GetLineNumber(frame.get()); + + // Profiler treats CallFrame.py_code as a weak pointer, with + // CodeDeallocHook tracking which code objects have been deallocated. + // Immediately DECREF the PyCodeObject to make it a weak pointer which may + // be safely GC'ed at any time. + PyCodeObject *code = PyFrame_GetCode(frame.get()); + Py_DECREF(code); + frames[num_frames].py_code = code; + num_frames++; - frame = frame->f_back; + frame.reset(PyFrame_GetBack(frame.get())); } trace.num_frames = num_frames; } diff --git a/kokoro/integration_test.go b/kokoro/integration_test.go index c56646c..e43569c 100644 --- a/kokoro/integration_test.go +++ b/kokoro/integration_test.go @@ -359,7 +359,7 @@ func generateTestCases(projectID, zone string) []testCase { }, } - for _, minorVersion := range []int{7, 8, 9, 10} { + for _, minorVersion := range []int{7, 8, 9, 10, 11} { tcs = append(tcs, testCase{ InstanceConfig: proftest.InstanceConfig{ ProjectID: projectID, diff --git a/kokoro/integration_test.sh b/kokoro/integration_test.sh index 1270c43..0c3aff5 100644 --- a/kokoro/integration_test.sh +++ b/kokoro/integration_test.sh @@ -70,7 +70,7 @@ go mod init e2e # Compile test before running to download dependencies. retry go get cloud.google.com/go/profiler/proftest@HEAD retry go test -c -./e2e.test -gcs_location="${GCS_LOCATION}" -run_backoff_test=$RUN_BACKOFF_TEST +./e2e.test -test.v -gcs_location="${GCS_LOCATION}" -run_backoff_test=$RUN_BACKOFF_TEST # Exit with success code if no need to release the agent. if [[ "$KOKORO_JOB_TYPE" != "RELEASE" ]]; then