diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 00000000000..dfd24c8d07e --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,201 @@ +name: 'Build and Push Kolla Images' + +on: + push: + branches: + - main + - staging + - qa + - dev + - experimental + + workflow_dispatch: + inputs: + branch: + type: choice + description: 'Branch to build' + options: + - main + - staging + - qa + - dev + - experimental + +permissions: + contents: read + id-token: write + +concurrency: + group: '${{ github.workflow }} @ ${{ github.repository }} @ ${{ github.event.inputs.branch || github.base_ref || github.ref_name }}' + cancel-in-progress: true + +jobs: + kolla_build: + runs-on: + group: prod + + steps: + + - uses: QumulusTechnology/vault-setup-action@v2 + with: + aws_account_data: ${{ secrets.AWS_ACCOUNT_DATA }} + vault_addr: ${{ secrets.VAULT_ADDR }} + platform: qcp + secrets: | + secret/data/qcp/global/harbor/prod/github-user username | REPO_USERNAME ; + secret/data/qcp/global/harbor/prod/github-user password | REPO_PASSWORD ; + secret/data/qcp/global/harbor/prod/github-user address | REPO_ADDRESS + + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + + - name: Set Release version + run: | + + if [ "$BRANCH" == "main" ]; then + echo "OPENSTACK_VERSION=2025.1" >> $GITHUB_ENV + echo "BASE_OS_TAG=24.04" >> $GITHUB_ENV + elif [ "$BRANCH" == "staging" ]; then + echo "OPENSTACK_VERSION=2025.1" >> $GITHUB_ENV + echo "BASE_OS_TAG=24.04" >> $GITHUB_ENV + elif [ "$BRANCH" == "qa" ]; then + echo "OPENSTACK_VERSION=2025.1" >> $GITHUB_ENV + echo "BASE_OS_TAG=24.04" >> $GITHUB_ENV + elif [ "$BRANCH" == "experimental" ]; then + echo "OPENSTACK_VERSION=2025.1" >> $GITHUB_ENV + echo "BASE_OS_TAG=24.04" >> $GITHUB_ENV + else + echo "OPENSTACK_VERSION=2025.1" >> $GITHUB_ENV + echo "BASE_OS_TAG=24.04" >> $GITHUB_ENV + fi + + + if [ "$BRANCH" == "main" ]; then + TAG=main + elif [ "$BRANCH" == "staging" ]; then + TAG=staging + elif [ "$BRANCH" == "qa" ]; then + TAG=qa + elif [ "$BRANCH" == "dev" ]; then + TAG=dev + elif [ "$BRANCH" == "experimental" ]; then + TAG=experimental + else + TAG=dev_${BRANCH} + fi + + echo "TAG=$TAG" >> $GITHUB_ENV + + - name: Set GITHUB Environment Variables + run: | + echo "GITHUB_ACTIONS_BRANCH=${{ github.base_ref || github.ref_name }}" >> $GITHUB_ENV + echo "GITHUB_ACTIONS_WORKFLOW_ID=${{ github.run_id }}" >> $GITHUB_ENV + echo "GITHUB_ACTIONS_WORKFLOW_ATEMPT=${{ github.run_attempt }}" >> $GITHUB_ENV + echo "GITHUB_ACTIONS_WORKFLOW_RUN_NUMBER=${{ github.run_number }}" >> $GITHUB_ENV + echo "GITHUB_ACTIONS_AUTHOR=${{ github.actor }}" >> $GITHUB_ENV + + - name: Login to Harbor Hub + uses: docker/login-action@v2 + with: + registry: ${{ env.REPO_ADDRESS }} + username: ${{ env.REPO_USERNAME }} + password: ${{ env.REPO_PASSWORD }} + + - name: Install Kolla + run: | + sudo apt update + sudo apt install -y bash python3 python3-pip git python3-dev python3-docker libffi-dev gcc libssl-dev python3-venv + python3 -m venv kolla + source kolla/bin/activate + pip install -U pip + pip install docker setuptools + pip install git+https://github.com/QumulusTechnology/kolla@${BRANCH} + + - name: Create kolla-build.conf + run: | + CWD="$(pwd)" + sudo mkdir -p /etc/kolla + sudo bash -c "cat << EOF > /etc/kolla/kolla-build.conf + + [DEFAULT] + base = ubuntu + namespace = kolla + base_tag = ${BASE_OS_TAG} + openstack_release = ${OPENSTACK_VERSION} + registry = ${REPO_ADDRESS}/qcp-${BRANCH} + push = true + skip_existing = false + threads = 16 + push_threads = 4 + install_type = source + tag = latest + template_override = /etc/kolla/template-overrides.j2 + docker_healthchecks = true + network_mode = host + + [${REPOSITORY}-base] + type = local + location = ${CWD} + EOF" + + + - name: Create template-overrides.j2 + run: | + sudo bash -c "cat << EOF > /etc/kolla/template-overrides.j2 + {% extends parent_template %} + + {% block base_ubuntu_package_sources_list %} + {% endblock %} + + {% block openstack_base_footer %} + RUN pip install jaeger-client + {% endblock %} + + {% block ${REPOSITORY}_base_footer %} + {% endblock %} + + {% block nova_compute_header %} + RUN apt clean && \ + apt update && \ + apt install -y swtpm swtpm-tools && \ + groupadd tss && \ + useradd -g tss tss && \ + mkdir /home/tss && \ + chown -R tss:tss /home/tss &&\ + chown -R tss:tss /var/lib/swtpm-localca + {% endblock %} + + {% block nova_compute_footer %} + RUN pip install git+https://github.com/openstack/oslo.utils.git@stable/${OPENSTACK_VERSION} + {% endblock %} + + {% block nova_libvirt_footer %} + RUN apt clean && \ + apt update && \ + apt install -y swtpm swtpm-tools && \ + mkdir /home/tss && \ + chown -R tss:tss /home/tss && \ + chown -R tss:tss /var/lib/swtpm-localca + + USER tss + RUN /usr/share/swtpm/swtpm-create-user-config-files + USER root + {% endblock %} + + EOF" + + - name: Build docker images + run: | + source kolla/bin/activate + kolla-build ${REPOSITORY} + + - name: Tag and push docker images + run: | + timestamp=$(date +%Y%m%d%H%M%S) + for i in $(docker images --format '{{.Repository}}' | grep "/qcp-${BRANCH}/"); do + image=${i##*/} + docker tag ${i}:latest ${REPO_ADDRESS}/qcp-${BRANCH}/kolla/${image}:${timestamp} + docker image push --all-tags ${i} + done diff --git a/.gitreview b/.gitreview index c2b7eef7078..bc50fb9256a 100644 --- a/.gitreview +++ b/.gitreview @@ -2,3 +2,4 @@ host=review.opendev.org port=29418 project=openstack/nova.git +defaultbranch=stable/2025.1 diff --git a/api-ref/source/parameters.yaml b/api-ref/source/parameters.yaml index 9a530cffbe7..f644c84f9ea 100644 --- a/api-ref/source/parameters.yaml +++ b/api-ref/source/parameters.yaml @@ -5721,6 +5721,7 @@ pinned_availability_zone: Also when default_schedule_zone config option set to specific AZ, in that case, instance would be pinned to that specific AZ, and instance will be scheduled on host belonging to pinned AZ. + In case of no pinned availability zone, this value is set to `null`. in: body type: string min_version: 2.96 diff --git a/nova/api/openstack/compute/views/servers.py b/nova/api/openstack/compute/views/servers.py index 8452d6bb102..023669406eb 100644 --- a/nova/api/openstack/compute/views/servers.py +++ b/nova/api/openstack/compute/views/servers.py @@ -14,8 +14,6 @@ # License for the specific language governing permissions and limitations # under the License. -import collections - from oslo_log import log as logging from oslo_serialization import jsonutils @@ -41,6 +39,7 @@ LOG = logging.getLogger(__name__) +AZ_NOT_IN_REQUEST_SPEC = object() SCHED_HINTS_NOT_IN_REQUEST_SPEC = object() @@ -225,16 +224,20 @@ def _get_host_status_unknown_only(context, instance=None): return unknown_only def _get_pinned_az(self, context, instance, provided_az): - pinned_az = '' - if provided_az is not None: + if provided_az is AZ_NOT_IN_REQUEST_SPEC: + # Case the provided_az is pre fetched, but not specified + pinned_az = None + elif provided_az is not None: + # Case the provided_az is pre fetched, and specified pinned_az = provided_az else: + # Case the provided_az is not pre fethed. try: req_spec = objects.RequestSpec.get_by_instance_uuid( context, instance.uuid) pinned_az = req_spec.availability_zone except exception.RequestSpecNotFound: - pinned_az = '' + pinned_az = None return pinned_az def _get_scheduler_hints(self, context, instance, provided_sched_hints): @@ -543,15 +546,16 @@ def _list_view(self, func, request, servers, coll_name, show_extra_specs, :returns: Server data in dictionary format """ req_specs = None - req_specs_dict = collections.defaultdict(str) + req_specs_dict = {} sched_hints_dict = {} if api_version_request.is_supported(request, min_version='2.96'): context = request.environ['nova.context'] instance_uuids = [s.uuid for s in servers] req_specs = objects.RequestSpec.get_by_instance_uuids( context, instance_uuids) - req_specs_dict = {req.instance_uuid: req.availability_zone - for req in req_specs} + req_specs_dict.update({req.instance_uuid: req.availability_zone + for req in req_specs + if req.availability_zone is not None}) if api_version_request.is_supported(request, min_version='2.100'): sched_hints_dict.update({ req.instance_uuid: req.scheduler_hints @@ -565,7 +569,8 @@ def _list_view(self, func, request, servers, coll_name, show_extra_specs, show_host_status=show_host_status, show_sec_grp=show_sec_grp, bdms=bdms, cell_down_support=cell_down_support, - provided_az=req_specs_dict[server.uuid], + provided_az=req_specs_dict.get( + server.uuid, AZ_NOT_IN_REQUEST_SPEC), provided_sched_hints=sched_hints_dict.get( server.uuid, SCHED_HINTS_NOT_IN_REQUEST_SPEC) )["server"] diff --git a/nova/api/openstack/wsgi_app.py b/nova/api/openstack/wsgi_app.py index 6a2b72a6111..d6f1030f835 100644 --- a/nova/api/openstack/wsgi_app.py +++ b/nova/api/openstack/wsgi_app.py @@ -15,6 +15,7 @@ import sys from oslo_config import cfg +from oslo_db import exception as odbe from oslo_log import log as logging from oslo_reports import guru_meditation_report as gmr from oslo_reports import opts as gmr_opts @@ -116,6 +117,7 @@ def init_global_data(conf_files, service_name): logging.DEBUG) +@utils.latch_error_on_raise(retryable=(odbe.DBConnectionError,)) def init_application(name): conf_files = _get_config_files() diff --git a/nova/test.py b/nova/test.py index 0ddd928904c..e084f5c14f7 100644 --- a/nova/test.py +++ b/nova/test.py @@ -304,6 +304,7 @@ def setUp(self): # make sure that the wsgi app is fully initialized for all testcase # instead of only once initialized for test worker wsgi_app.init_global_data.reset() + wsgi_app.init_application.reset() # Reset the placement client singleton report.PLACEMENTCLIENT = None diff --git a/nova/tests/functional/regressions/test_bug_2098496.py b/nova/tests/functional/regressions/test_bug_2098496.py new file mode 100644 index 00000000000..731d9f441a4 --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_2098496.py @@ -0,0 +1,181 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +import copy + +from nova import objects +from nova.tests.fixtures import libvirt as fakelibvirt +from nova.tests.functional.libvirt import test_pci_in_placement as base + + +class PCIInPlacementCreateAfterDeleteTestCase(base.PlacementPCIReportingTests): + def setUp(self): + super().setUp() + self.flags(group='filter_scheduler', pci_in_placement=True) + + def test_create_after_delete(self): + device_spec = self._to_list_of_json_str( + [ + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.VF_PROD_ID, + }, + ] + ) + pci_alias = { + "name": "a-vf", + "product_id": fakelibvirt.VF_PROD_ID, + "vendor_id": fakelibvirt.PCI_VEND_ID, + } + self._test_create_second_vm_after_first_is_deleted( + device_spec, pci_alias, self.VF_RC, traits=[]) + + def test_create_after_delete_with_rc_and_traits(self): + device_spec = self._to_list_of_json_str( + [ + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.VF_PROD_ID, + "resource_class": "vf", + "traits": "blue,big", + }, + ] + ) + pci_alias = { + "name": "a-vf", + "resource_class": "vf", + "traits": "big", + } + self._test_create_second_vm_after_first_is_deleted( + device_spec, pci_alias, + "CUSTOM_VF", traits=["CUSTOM_BLUE", "CUSTOM_BIG"]) + + def _test_create_second_vm_after_first_is_deleted( + self, device_spec, pci_alias, rc, traits + ): + # The fake libvirt will emulate on the host: + # * one PF with 2 VFs + pci_info = fakelibvirt.HostPCIDevicesInfo( + num_pci=0, num_pfs=1, num_vfs=2) + # make the VFs available + self.flags(group='pci', device_spec=device_spec) + self.start_compute(hostname="compute1", pci_info=pci_info) + + compute1_expected_placement_view = { + "inventories": { + "0000:81:00.0": {rc: 2}, + }, + "traits": {"0000:81:00.0": traits, + }, + "usages": { + "0000:81:00.0": {rc: 0}, + }, + "allocations": {}, + } + self.assert_placement_pci_view( + "compute1", **compute1_expected_placement_view) + self.assertPCIDeviceCounts('compute1', total=2, free=2) + # assert that the two VFs from the same PF are in the same pool + pools = objects.ComputeNode.get_first_node_by_host_for_old_compat( + self.ctxt, "compute1").pci_device_pools + self.assertEqual(len(pools), 1) + self.assertEqual(pools[0].count, 2) + compute1_empty = copy.deepcopy(compute1_expected_placement_view) + + self.flags( + group="pci", + alias=self._to_list_of_json_str([pci_alias]), + ) + extra_spec = {"pci_passthrough:alias": "a-vf:1"} + flavor_id = self._create_flavor(extra_spec=extra_spec) + server_1vf = self._create_server(flavor_id=flavor_id, networks=[]) + + self.assertPCIDeviceCounts('compute1', total=2, free=1) + compute1_expected_placement_view["usages"] = { + "0000:81:00.0": {rc: 1} + } + compute1_expected_placement_view["allocations"][server_1vf["id"]] = { + "0000:81:00.0": {rc: 1}, + } + self.assert_placement_pci_view( + "compute1", **compute1_expected_placement_view) + self.assert_no_pci_healing("compute1") + + self._delete_server(server_1vf) + self.assertPCIDeviceCounts('compute1', total=2, free=2) + self.assert_placement_pci_view("compute1", **compute1_empty) + + # This is already shows the potential issue behind bug/2098496. + # After the server is deleted we are not back to a single pool of 2 + # VFs but instead two separate pools with one device each but both + # related to the same rp_uuid. This breaks an assumption of the pool + # allocation logic later. + pools = objects.ComputeNode.get_first_node_by_host_for_old_compat( + self.ctxt, "compute1").pci_device_pools + self.assertEqual(len(pools), 2) + self.assertEqual(pools[0].count, 1) + self.assertEqual(pools[1].count, 1) + self.assertEqual(pools[0].tags['rp_uuid'], pools[1].tags['rp_uuid']) + + # This is the expected state after the bug is fixed + # assert that the single pool is not broken into two during the + # de-allocation + # pools = objects.ComputeNode.get_first_node_by_host_for_old_compat( + # self.ctxt, "compute1").pci_device_pools + # self.assertEqual(len(pools), 1) + # self.assertEqual(pools[0].count, 2) + + server_1vf = self._create_server(flavor_id=flavor_id, networks=[]) + # This is bug/2098496 as the VM now consumes 2 VFs instead of one it + # requested + self.assertPCIDeviceCounts('compute1', total=2, free=0) + compute1_expected_placement_view = copy.deepcopy(compute1_empty) + compute1_expected_placement_view["usages"] = { + "0000:81:00.0": {rc: 2} + } + compute1_expected_placement_view["allocations"][server_1vf["id"]] = { + "0000:81:00.0": {rc: 2}, + } + self.assert_placement_pci_view( + "compute1", **compute1_expected_placement_view) + # prove that the placement view got corrupted only after the claim + # on the compute due the allocation healing logic + last_healing = self.pci_healing_fixture.last_healing("compute1") + alloc_before = last_healing[0] + alloc_after = last_healing[1] + + def alloc_by_rc(allocations, rc): + return [ + alloc["resources"][rc] + for alloc in allocations.values() + if rc in alloc["resources"] + ] + + pci_alloc_before = alloc_by_rc( + alloc_before[server_1vf["id"]]["allocations"], rc) + self.assertEqual(1, sum(pci_alloc_before)) + + pci_alloc_after = alloc_by_rc( + alloc_after[server_1vf["id"]]["allocations"], rc) + self.assertEqual(2, sum(pci_alloc_after)) + + # This is the expected result after the bug is fixed + # self.assertPCIDeviceCounts('compute1', total=2, free=1) + # compute1_expected_placement_view = copy.deepcopy(compute1_empty) + # compute1_expected_placement_view["usages"] = { + # "0000:81:00.0": {rc: 1} + # } + # compute1_expected_placement_view["allocations"][server_1vf["id"]] = { + # "0000:81:00.0": {rc: 1}, + # } + # self.assert_placement_pci_view( + # "compute1", **compute1_expected_placement_view) + # self.assert_no_pci_healing("compute1") diff --git a/nova/tests/unit/api/openstack/compute/test_servers.py b/nova/tests/unit/api/openstack/compute/test_servers.py index cf165fde250..745bc6f891b 100644 --- a/nova/tests/unit/api/openstack/compute/test_servers.py +++ b/nova/tests/unit/api/openstack/compute/test_servers.py @@ -8466,8 +8466,74 @@ def test_list_view_with_missing_request_specs(self, m_rs): availability_zone=self.instance.availability_zone)] req = self.req('/%s/servers' % self.project_id) - self.assertRaises(KeyError, self.view_builder.index, - req, self.instances, False) + output = self.view_builder.index(req, self.instances, False) + + self.assertEqual(2, len(output['servers'])) + + @mock.patch('nova.objects.RequestSpec.get_by_instance_uuids') + def test_list_detail_view_with_missing_request_specs(self, m_rs): + + self.instances = [ + self.instance, + self.create_instance(2, uuids.fake1, 'fake-server'), + self.create_instance(3, uuids.fake2, 'fake-server2') + ] + # First instance's request spec has pinned availability zone + # Second instance's request spec has no pinned availability zone + m_rs.return_value = [ + objects.RequestSpec( + instance_uuid=self.instance.uuid, + availability_zone=self.instance.availability_zone), + objects.RequestSpec( + instance_uuid=self.instance.uuid, + availability_zone=None) + ] + + req = self.req('/%s/servers/detail' % self.project_id) + output = self.view_builder.detail(req, self.instances, False) + + self.assertEqual(3, len(output['servers'])) + # first instance has pinned az + self.assertEqual('nova', + output['servers'][0]['pinned_availability_zone']) + # second or later has no pinned az + for s in output['servers'][1:]: + self.assertIsNone(s['pinned_availability_zone']) + + @mock.patch('nova.objects.RequestSpec.get_by_instance_uuid') + def test_show_view_with_missing_request_specs(self, m_rs): + + self.instances = [ + self.instance, + self.create_instance(2, uuids.fake1, 'fake-server'), + self.create_instance(3, uuids.fake2, 'fake-server2') + ] + # First instance's request spec has pinned availability zone + # Second instance's request spec has no pinned availability zone + m_rs.side_effect = [ + objects.RequestSpec( + instance_uuid=self.instance.uuid, + availability_zone=self.instance.availability_zone), + objects.RequestSpec( + instance_uuid=self.instance.uuid, + availability_zone=None), + exception.RequestSpecNotFound(instance_uuid='3') + ] + + # Instance show with request spec and pinned az + req = self.req('/%s/servers/1' % self.project_id) + output = self.view_builder.show(req, self.instances[0]) + self.assertEqual('nova', output['server']['pinned_availability_zone']) + + # Instance show with request spec and no pinned az + req = self.req('/%s/servers/2' % self.project_id) + output = self.view_builder.show(req, self.instances[1]) + self.assertIsNone(output['server']['pinned_availability_zone']) + + # Instance show without request spec + req = self.req('/%s/servers/3' % self.project_id) + output = self.view_builder.show(req, self.instances[2]) + self.assertIsNone(output['server']['pinned_availability_zone']) class ServersViewBuilderTestV2100(_ServersViewBuilderTest): diff --git a/nova/tests/unit/api/openstack/test_wsgi_app.py b/nova/tests/unit/api/openstack/test_wsgi_app.py index 0eb7011c116..7fa1eacc623 100644 --- a/nova/tests/unit/api/openstack/test_wsgi_app.py +++ b/nova/tests/unit/api/openstack/test_wsgi_app.py @@ -82,6 +82,8 @@ def test_init_application_called_twice( # raised during it. self.assertRaises(test.TestingException, wsgi_app.init_application, 'nova-api') + # reset the latch_error_on_raise decorator + wsgi_app.init_application.reset() # Now run init_application a second time, it should succeed since no # exception is being raised (the init of global data should not be # re-attempted). @@ -89,6 +91,26 @@ def test_init_application_called_twice( self.assertIn('Global data already initialized, not re-initializing.', self.stdlog.logger.output) + @mock.patch( + 'sys.argv', new=mock.MagicMock(return_value=mock.sentinel.argv)) + @mock.patch('nova.api.openstack.wsgi_app._get_config_files') + def test_init_application_called_unrecoverable(self, mock_get_files): + """Test that init_application can tolerate being called more than once + in a single python interpreter instance and raises the same exception + forever if its unrecoverable. + """ + error = ValueError("unrecoverable config error") + excepted_type = type(error) + mock_get_files.side_effect = [ + error, test.TestingException, test.TestingException] + for i in range(3): + e = self.assertRaises( + excepted_type, wsgi_app.init_application, 'nova-api') + self.assertIs(e, error) + # since the expction is latched on the first raise mock_get_files + # should not be called again on each iteration + mock_get_files.assert_called_once() + @mock.patch('nova.objects.Service.get_by_host_and_binary') @mock.patch('nova.utils.raise_if_old_compute') def test_setup_service_version_workaround(self, mock_check_old, mock_get): diff --git a/nova/tests/unit/test_utils.py b/nova/tests/unit/test_utils.py index 1ee8ba93658..b34136b0522 100644 --- a/nova/tests/unit/test_utils.py +++ b/nova/tests/unit/test_utils.py @@ -1398,3 +1398,58 @@ def f(): self.assertRaises(ValueError, f.reset) self.assertFalse(f.called) mock_clean.assert_called_once_with() + + +class LatchErrorOnRaiseTests(test.NoDBTestCase): + + error = test.TestingException() + unrecoverable = ValueError('some error') + + @utils.latch_error_on_raise(retryable=(test.TestingException,)) + def dummy_test_func(self, error=None): + if error: + raise error + return True + + def setUp(self): + super().setUp() + self.dummy_test_func.reset() + + @mock.patch.object(utils.LOG, 'exception') + def test_wrapped_success(self, fake_logger): + self.assertTrue(self.dummy_test_func()) + fake_logger.assert_not_called() + self.assertIsNone(self.dummy_test_func.error) + + @mock.patch.object(utils.LOG, 'exception') + def test_wrapped_raises_recoverable(self, fake_logger): + expected = LatchErrorOnRaiseTests.error + e = self.assertRaises( + type(expected), self.dummy_test_func, error=expected) + self.assertIs(expected, e) + # we just leave recoverable exception flow though the decorator + # without catching them so the logger should not be called by the + # decorator + fake_logger.assert_not_called() + self.assertIsNone(self.dummy_test_func.error) + self.assertTrue(self.dummy_test_func()) + + @mock.patch.object(utils.LOG, 'exception') + def test_wrapped_raises_unrecoverable(self, fake_logger): + expected = LatchErrorOnRaiseTests.unrecoverable + e = self.assertRaises( + type(expected), self.dummy_test_func, error=expected) + self.assertIs(expected, e) + fake_logger.assert_called_once_with(expected) + self.assertIsNotNone(self.dummy_test_func.error) + self.assertIs(self.dummy_test_func.error, expected) + + @mock.patch.object(utils.LOG, 'exception', new=mock.MagicMock()) + def test_wrapped_raises_forever(self): + expected = LatchErrorOnRaiseTests.unrecoverable + first = self.assertRaises( + type(expected), self.dummy_test_func, error=expected) + self.assertIs(expected, first) + second = self.assertRaises( + type(expected), self.dummy_test_func, error=expected) + self.assertIs(first, second) diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index 137c53d1fed..e18aa4558d4 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -1582,15 +1582,18 @@ def test_spawn_node_driver_validation_fail(self, mock_avti, self.mock_conn.validate_node.return_value = \ ironic_utils.get_test_validation( - power=_node.ValidationResult(result=False, reason=None), + power=_node.ValidationResult(result=False, reason='OVERVOLT'), deploy=_node.ValidationResult(result=False, reason=None), - storage=_node.ValidationResult(result=False, reason=None), + storage=_node.ValidationResult(result=True, reason=None), ) self.mock_conn.get_node.return_value = node image_meta = ironic_utils.get_test_image_meta() - self.assertRaises(exception.ValidationError, self.driver.spawn, - self.ctx, instance, image_meta, [], None, {}) + msgre = '.*deploy: None, power: OVERVOLT, storage: No Error.*' + + self.assertRaisesRegex(exception.ValidationError, msgre, + self.driver.spawn, self.ctx, instance, image_meta, + [], None, {}) self.mock_conn.get_node.assert_called_once_with( node_id, fields=ironic_driver._NODE_FIELDS) mock_avti.assert_called_once_with(self.ctx, instance, None) diff --git a/nova/utils.py b/nova/utils.py index b6b37853311..a3b2353c5e0 100644 --- a/nova/utils.py +++ b/nova/utils.py @@ -1194,3 +1194,42 @@ def reset(wrapper, *args, **kwargs): wrapper.reset = functools.partial(reset, wrapper) return wrapper return outer_wrapper + + +class _SentinelException(Exception): + """This type exists to act as a placeholder and will never be raised""" + + +def latch_error_on_raise(retryable=(_SentinelException,)): + """This is a utility decorator to ensure if a function ever raises + it will always raise the same exception going forward. + + The only exception we know is safe to ignore is an oslo db connection + error as the db may be temporarily unavailable and we should allow + mod_wsgi to retry + """ + + def outer_wrapper(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if wrapper.error: + raise wrapper.error + try: + return func(*args, **kwargs) + except retryable: + # reraise any retryable exception to allow them to be handled + # by the caller. + raise + except Exception as e: + wrapper.error = e + LOG.exception(e) + raise + + wrapper.error = None + + def reset(wrapper): + wrapper.error = None + + wrapper.reset = functools.partial(reset, wrapper) + return wrapper + return outer_wrapper diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index 3e58cf93a34..6da6c457ee5 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -1213,14 +1213,20 @@ def spawn(self, context, instance, image_meta, injected_files, ): # something is wrong. undo what we have done self._cleanup_deploy(node, instance, network_info) + deploy_msg = ("No Error" if validate_chk['deploy'].result + else validate_chk['deploy'].reason) + power_msg = ("No Error" if validate_chk['power'].result + else validate_chk['power'].reason) + storage_msg = ("No Error" if validate_chk['storage'].result + else validate_chk['storage'].reason) raise exception.ValidationError(_( "Ironic node: %(id)s failed to validate. " "(deploy: %(deploy)s, power: %(power)s, " "storage: %(storage)s)") % {'id': node.id, - 'deploy': validate_chk['deploy'], - 'power': validate_chk['power'], - 'storage': validate_chk['storage']}) + 'deploy': deploy_msg, + 'power': power_msg, + 'storage': storage_msg}) # Config drive configdrive_value = None diff --git a/releasenotes/notes/bug-2095364-ffbf67c0ae3f53b5.yaml b/releasenotes/notes/bug-2095364-ffbf67c0ae3f53b5.yaml new file mode 100644 index 00000000000..672863666d5 --- /dev/null +++ b/releasenotes/notes/bug-2095364-ffbf67c0ae3f53b5.yaml @@ -0,0 +1,15 @@ +--- +fixes: + - | + `Bug #2095364`_: Fixed the List Server API and the List Server Detail API + 500 Internal Server Error issue in v2.96 or later API microversion if + one or more instance has no request spec object. One usecase was when cloud + user tried to create instance which exceeded their quota, the request does + not create instance request spec. Once the no request spec instance is + created in cloud user project, the server list API and the list server + details API return 500 Internal Server Error for the project until the + cloud user deletes the no request spec object instance. + After this fix, the v2.96 or later returns `null` at the + `pinned_availability_zone` value if not specified. + + .. _Bug #2095364: https://launchpad.net/bugs/2095364 diff --git a/releasenotes/notes/ironic-validate-node-message-6a8b1eedbddd06fd.yaml b/releasenotes/notes/ironic-validate-node-message-6a8b1eedbddd06fd.yaml new file mode 100644 index 00000000000..2ac2e894011 --- /dev/null +++ b/releasenotes/notes/ironic-validate-node-message-6a8b1eedbddd06fd.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fix displaying the reason messages from the Ironic validate node operation that + is called just before the instance is deployed on the bare metal node. The + message from Ironic is now correctly logged. + Fixes `bug 2100009 _`. diff --git a/releasenotes/notes/latch-error-on-raise-cf2da71a12b5f55f.yaml b/releasenotes/notes/latch-error-on-raise-cf2da71a12b5f55f.yaml new file mode 100644 index 00000000000..3c364b8b99a --- /dev/null +++ b/releasenotes/notes/latch-error-on-raise-cf2da71a12b5f55f.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + The nova (metadata)api wsgi application will now detect fatal errors + (configuration, et al) on startup and lock into a permanent error state + until fixed and restarted. This solves a problem with some wsgi runtimes + ignoring initialization errors and continuing to send requests to the + half-initialized service. See https://bugs.launchpad.net/nova/+bug/2103811 + for more details. diff --git a/tox.ini b/tox.ini index d402501920f..9026f14ab1d 100644 --- a/tox.ini +++ b/tox.ini @@ -10,7 +10,7 @@ allowlist_externals = rm env make -install_command = python -I -m pip install -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/master} {opts} {packages} +install_command = python -I -m pip install -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/2025.1} {opts} {packages} setenv = VIRTUAL_ENV={envdir} LANGUAGE=en_US