From c38b1dbe031795f842440ebd70d89debe38256ad Mon Sep 17 00:00:00 2001 From: OpenStack Release Bot Date: Fri, 16 Sep 2022 10:01:57 +0000 Subject: [PATCH 01/73] [stable-only] Update .gitreview for stable/zed Change-Id: Ia232e95b1b0cb0281990cf326764951ef8c1b678 --- .gitreview | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitreview b/.gitreview index c2b7eef7078..bc0f313915b 100644 --- a/.gitreview +++ b/.gitreview @@ -2,3 +2,4 @@ host=review.opendev.org port=29418 project=openstack/nova.git +defaultbranch=stable/zed From 839502e454abf76a1d66def397741a3a924431de Mon Sep 17 00:00:00 2001 From: OpenStack Release Bot Date: Fri, 16 Sep 2022 10:02:05 +0000 Subject: [PATCH 02/73] [stable-only] Update TOX_CONSTRAINTS_FILE for stable/zed Update the URL to the upper-constraints file to point to the redirect rule on releases.openstack.org so that anyone working on this branch will switch to the correct upper-constraints list automatically when the requirements repository branches. Until the requirements repository has as stable/zed branch, tests will continue to use the upper-constraints list on master. Change-Id: I9dad37b404c41f6837bff49c8dcb0d9d254d37f7 --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index edb08599e7d..0f798c2fde4 100644 --- a/tox.ini +++ b/tox.ini @@ -26,7 +26,7 @@ setenv = # TODO(stephenfin): Remove once we bump our upper-constraint to SQLAlchemy 2.0 SQLALCHEMY_WARN_20=1 deps = - -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/master} + -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/zed} -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt extras = @@ -200,7 +200,7 @@ description = # Note that we don't use {[testenv]deps} for deps here because we don't want # to install (test-)requirements.txt for docs. 
deps = - -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/master} + -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/zed} -r{toxinidir}/requirements.txt -r{toxinidir}/doc/requirements.txt extras = From 2db7cbf7d0a6692a885968bec67f48e7262ceec3 Mon Sep 17 00:00:00 2001 From: Thomas Goirand Date: Sat, 17 Sep 2022 12:56:03 +0200 Subject: [PATCH 03/73] requires os-traits >= 2.9.0 Without the latest version, nova fails many unit tests (it failed with os-traits 2.7.0 at least). Closes-Bug: #1990121 Change-Id: I6b320ae1f9058aaa5bac91c7c7ca60136e0cee5c (cherry picked from commit 6a06a57290b6f38b6a7c97e47017127472834d1f) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9d523ebd7d3..c38ade020dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -53,7 +53,7 @@ psutil>=3.2.2 # BSD oslo.versionedobjects>=1.35.0 # Apache-2.0 os-brick>=5.2 # Apache-2.0 os-resource-classes>=1.1.0 # Apache-2.0 -os-traits>=2.7.0 # Apache-2.0 +os-traits>=2.9.0 # Apache-2.0 os-vif>=1.15.2 # Apache-2.0 castellan>=0.16.0 # Apache-2.0 microversion-parse>=0.2.1 # Apache-2.0 From c3489ed5cc21a9fa968949e04f1c7762f09b5606 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Fri, 16 Sep 2022 10:39:42 +0200 Subject: [PATCH 04/73] Remove mentions of removed scheduler filters Change-Id: I1348cca8cbd8b1142dab8507c8aa1b9baf01e73c (cherry picked from commit 4fb4f6832c156907b786571f214984894703bf16) --- doc/source/contributor/development-environment.rst | 2 +- nova/conf/compute.py | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/doc/source/contributor/development-environment.rst b/doc/source/contributor/development-environment.rst index 32b8f8334e0..3e19ef1ca23 100644 --- a/doc/source/contributor/development-environment.rst +++ b/doc/source/contributor/development-environment.rst @@ -197,7 +197,7 @@ Using fake computes for tests The number of 
instances supported by fake computes is not limited by physical constraints. It allows you to perform stress tests on a deployment with few resources (typically a laptop). Take care to avoid using scheduler filters -that will limit the number of instances per compute, such as ``AggregateCoreFilter``. +that will limit the number of instances per compute, such as ``NumInstancesFilter``. Fake computes can also be used in multi hypervisor-type deployments in order to take advantage of fake and "real" computes during tests: diff --git a/nova/conf/compute.py b/nova/conf/compute.py index 004dbb83b6d..71707cfd531 100644 --- a/nova/conf/compute.py +++ b/nova/conf/compute.py @@ -440,9 +440,7 @@ Virtual CPU to physical CPU allocation ratio. This option is used to influence the hosts selected by the Placement API by -configuring the allocation ratio for ``VCPU`` inventory. In addition, the -``AggregateCoreFilter`` (deprecated) will fall back to this configuration value -if no per-aggregate setting is found. +configuring the allocation ratio for ``VCPU`` inventory. .. note:: @@ -473,9 +471,7 @@ Virtual RAM to physical RAM allocation ratio. This option is used to influence the hosts selected by the Placement API by -configuring the allocation ratio for ``MEMORY_MB`` inventory. In addition, the -``AggregateRamFilter`` (deprecated) will fall back to this configuration value -if no per-aggregate setting is found. +configuring the allocation ratio for ``MEMORY_MB`` inventory. .. note:: @@ -501,9 +497,7 @@ Virtual disk to physical disk allocation ratio. This option is used to influence the hosts selected by the Placement API by -configuring the allocation ratio for ``DISK_GB`` inventory. In addition, the -``AggregateDiskFilter`` (deprecated) will fall back to this configuration value -if no per-aggregate setting is found. +configuring the allocation ratio for ``DISK_GB`` inventory. 
When configured, a ratio greater than 1.0 will result in over-subscription of the available physical disk, which can be useful for more efficiently packing From 00396fa9396324780c09161ed57a86b7e458c26f Mon Sep 17 00:00:00 2001 From: Brett Milford Date: Thu, 4 Aug 2022 16:52:33 +1000 Subject: [PATCH 05/73] Handle "no RAM info was set" migration case This handles the case where the live migration monitoring thread may race and call jobStats() after the migration has completed resulting in the following error: libvirt.libvirtError: internal error: migration was active, but no RAM info was set Closes-Bug: #1982284 Change-Id: I77fdfa9cffbd44b2889f49f266b2582bcc6a4267 (cherry picked from commit 9fea934c71d3c2fa7fdd80c67d94e18466c5cf9a) --- nova/tests/unit/virt/libvirt/test_guest.py | 22 +++++++++++++++++++ nova/virt/libvirt/guest.py | 7 ++++++ ...-no-ram-info-was-set-99784934ed80fd72.yaml | 11 ++++++++++ 3 files changed, 40 insertions(+) create mode 100644 releasenotes/notes/bug-1982284-libvirt-handle-no-ram-info-was-set-99784934ed80fd72.yaml diff --git a/nova/tests/unit/virt/libvirt/test_guest.py b/nova/tests/unit/virt/libvirt/test_guest.py index 47ee2289436..5b181b8f067 100644 --- a/nova/tests/unit/virt/libvirt/test_guest.py +++ b/nova/tests/unit/virt/libvirt/test_guest.py @@ -1053,3 +1053,25 @@ def test_job_info_operation_invalid(self, mock_stats, mock_info): mock_stats.assert_called_once_with() mock_info.assert_called_once_with() + + @mock.patch.object(fakelibvirt.virDomain, "jobInfo") + @mock.patch.object(fakelibvirt.virDomain, "jobStats") + def test_job_stats_no_ram(self, mock_stats, mock_info): + mock_stats.side_effect = fakelibvirt.make_libvirtError( + fakelibvirt.libvirtError, + "internal error: migration was active, but no RAM info was set", + error_code=fakelibvirt.VIR_ERR_INTERNAL_ERROR, + error_message="migration was active, but no RAM info was set") + + info = self.guest.get_job_info() + + self.assertIsInstance(info, libvirt_guest.JobInfo) + 
self.assertEqual(fakelibvirt.VIR_DOMAIN_JOB_NONE, info.type) + self.assertEqual(0, info.time_elapsed) + self.assertEqual(0, info.time_remaining) + self.assertEqual(0, info.memory_total) + self.assertEqual(0, info.memory_processed) + self.assertEqual(0, info.memory_remaining) + + mock_stats.assert_called_once_with() + self.assertFalse(mock_info.called) diff --git a/nova/virt/libvirt/guest.py b/nova/virt/libvirt/guest.py index 4c6fd160a86..c40c3c4a7f1 100644 --- a/nova/virt/libvirt/guest.py +++ b/nova/virt/libvirt/guest.py @@ -674,6 +674,7 @@ def get_job_info(self): stats = self._domain.jobStats() return JobInfo(**stats) except libvirt.libvirtError as ex: + errmsg = ex.get_error_message() if ex.get_error_code() == libvirt.VIR_ERR_NO_SUPPORT: # Remote libvirt doesn't support new API LOG.debug("Missing remote virDomainGetJobStats: %s", ex) @@ -686,6 +687,12 @@ def get_job_info(self): # away completclsely LOG.debug("Domain has shutdown/gone away: %s", ex) return JobInfo(type=libvirt.VIR_DOMAIN_JOB_COMPLETED) + elif (ex.get_error_code() == libvirt.VIR_ERR_INTERNAL_ERROR and + errmsg and "migration was active, " + "but no RAM info was set" in errmsg): + LOG.debug("Migration is active or completed but " + "virDomainGetJobStats is missing ram: %s", ex) + return JobInfo(type=libvirt.VIR_DOMAIN_JOB_NONE) else: LOG.debug("Failed to get job stats: %s", ex) raise diff --git a/releasenotes/notes/bug-1982284-libvirt-handle-no-ram-info-was-set-99784934ed80fd72.yaml b/releasenotes/notes/bug-1982284-libvirt-handle-no-ram-info-was-set-99784934ed80fd72.yaml new file mode 100644 index 00000000000..943aa99a436 --- /dev/null +++ b/releasenotes/notes/bug-1982284-libvirt-handle-no-ram-info-was-set-99784934ed80fd72.yaml @@ -0,0 +1,11 @@ +--- +other: + - | + A workaround has been added to the libvirt driver to catch and pass + migrations that were previously failing with the error: + + ``libvirt.libvirtError: internal error: migration was active, but no RAM info was set`` + + See `bug 
1982284`_ for more details. + + .. _bug 1982284: https://bugs.launchpad.net/nova/+bug/1982284 From 74a618a8118642c9fd32c4e0d502d12ac826affe Mon Sep 17 00:00:00 2001 From: Amit Uniyal Date: Thu, 25 Aug 2022 05:08:44 +0000 Subject: [PATCH 06/73] Adds a reproducer for post live migration fail Adds a regression test or reproducer for post live migration fail at destination, the possible cause can be failure to get instance network info or block device info changes: adds updating server after _live_migrate in reproducer test (missed in main commit) Related-Bug: #1628606 Change-Id: I48dbe0aae8a3943fdde69cda1bd663d70ea0eb19 (cherry picked from commit a20baeca1f5ebb0dfe9607335a6986e9ed0e1725) --- .../regressions/test_bug_1628606.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 nova/tests/functional/regressions/test_bug_1628606.py diff --git a/nova/tests/functional/regressions/test_bug_1628606.py b/nova/tests/functional/regressions/test_bug_1628606.py new file mode 100644 index 00000000000..995c552cf9a --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_1628606.py @@ -0,0 +1,61 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +from nova import test +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional.api import client +from nova.tests.functional import fixtures as func_fixtures +from nova.tests.functional import integrated_helpers +from unittest import mock + + +class PostLiveMigrationFail( + test.TestCase, integrated_helpers.InstanceHelperMixin): + """Regression test for bug 1628606 + """ + + def setUp(self): + super().setUp() + self.useFixture(nova_fixtures.NeutronFixture(self)) + self.glance = self.useFixture(nova_fixtures.GlanceFixture(self)) + self.useFixture(func_fixtures.PlacementFixture()) + self.useFixture(nova_fixtures.HostNameWeigherFixture()) + + self.start_service('conductor') + self.start_service('scheduler') + + api_fixture = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')) + + self.api = api_fixture.admin_api + self.api.microversion = 'latest' + + self.src = self._start_compute(host='host1') + self.dest = self._start_compute(host='host2') + + @mock.patch( + 'nova.compute.manager.ComputeManager' + '._post_live_migration_remove_source_vol_connections') + def test_post_live_migration(self, mock_migration): + server = self._create_server(networks=[]) + self.assertEqual(self.src.host, server['OS-EXT-SRV-ATTR:host']) + + error = client.OpenStackApiException( + "Failed to remove source vol connection post live migration") + mock_migration.side_effect = error + + server = self._live_migrate( + server, migration_expected_state='error', + server_expected_state='ERROR') + # FIXME(amit): this should point to the dest as after migration + # but does not because of bug 1628606 + self.assertEqual(self.src.host, server['OS-EXT-SRV-ATTR:host']) From 643b0c7d35752b214eee19b8d7298a19a8493f6b Mon Sep 17 00:00:00 2001 From: Sean Mooney Date: Thu, 13 May 2021 12:48:21 +0100 Subject: [PATCH 07/73] [compute] always set instance.host in post_livemigration This change add a new _post_live_migration_update_host function that wraps _post_live_migration 
and just ensures that if we exit due to an exception instance.host is set to the destination host. when we are in _post_live_migration the guest has already started running on the destination host and we cannot revert. Sometimes admins or users will hard reboot the instance expecting that to fix everything when the vm enters the error state after the failed migrations. Previously this would end up recreating the instance on the source node leading to possible data corruption if the instance used shared storage. Change-Id: Ibc4bc7edf1c8d1e841c72c9188a0a62836e9f153 Partial-Bug: #1628606 (cherry picked from commit 8449b7caefa4a5c0728e11380a088525f15ad6f5) --- nova/compute/manager.py | 43 +++++++++++++++++-- .../regressions/test_bug_1628606.py | 5 +-- nova/tests/unit/compute/test_compute_mgr.py | 21 +++++++++ 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 667455215c8..f25d037c504 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -8761,8 +8761,9 @@ def _do_live_migration(self, context, dest, instance, block_migration, # host attachment. We fetch BDMs before that to retain connection_info # and attachment_id relating to the source host for post migration # cleanup. 
- post_live_migration = functools.partial(self._post_live_migration, - source_bdms=source_bdms) + post_live_migration = functools.partial( + self._post_live_migration_update_host, source_bdms=source_bdms + ) rollback_live_migration = functools.partial( self._rollback_live_migration, source_bdms=source_bdms) @@ -9037,6 +9038,42 @@ def _post_live_migration_remove_source_vol_connections( bdm.attachment_id, self.host, str(e), instance=instance) + # TODO(sean-k-mooney): add typing + def _post_live_migration_update_host( + self, ctxt, instance, dest, block_migration=False, + migrate_data=None, source_bdms=None + ): + try: + self._post_live_migration( + ctxt, instance, dest, block_migration, migrate_data, + source_bdms) + except Exception: + # Restore the instance object + node_name = None + try: + # get node name of compute, where instance will be + # running after migration, that is destination host + compute_node = self._get_compute_info(ctxt, dest) + node_name = compute_node.hypervisor_hostname + except exception.ComputeHostNotFound: + LOG.exception('Failed to get compute_info for %s', dest) + + # we can never rollback from post live migration and we can only + # get here if the instance is running on the dest so we ensure + # the instance.host is set correctly and reraise the original + # exception unmodified. + if instance.host != dest: + # apply saves the new fields while drop actually removes the + # migration context from the instance, so migration persists. + instance.apply_migration_context() + instance.drop_migration_context() + instance.host = dest + instance.task_state = None + instance.node = node_name + instance.progress = 0 + instance.save() + raise + @wrap_exception() @wrap_instance_fault def _post_live_migration(self, ctxt, instance, dest, @@ -9048,7 +9085,7 @@ def _post_live_migration(self, ctxt, instance, dest, and mainly updating database record. 
:param ctxt: security context - :param instance: instance dict + :param instance: instance object :param dest: destination host :param block_migration: if true, prepare for block migration :param migrate_data: if not None, it is a dict which has data diff --git a/nova/tests/functional/regressions/test_bug_1628606.py b/nova/tests/functional/regressions/test_bug_1628606.py index 995c552cf9a..0fccd78ccec 100644 --- a/nova/tests/functional/regressions/test_bug_1628606.py +++ b/nova/tests/functional/regressions/test_bug_1628606.py @@ -56,6 +56,5 @@ def test_post_live_migration(self, mock_migration): server = self._live_migrate( server, migration_expected_state='error', server_expected_state='ERROR') - # FIXME(amit): this should point to the dest as after migration - # but does not because of bug 1628606 - self.assertEqual(self.src.host, server['OS-EXT-SRV-ATTR:host']) + + self.assertEqual(self.dest.host, server['OS-EXT-SRV-ATTR:host']) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index b8eef08d600..1a4935f482a 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -10209,6 +10209,27 @@ def test_post_live_migration_new_allocations(self): self.instance, migration) + def test_post_live_migration_update_host(self): + @mock.patch.object(self.compute, '_get_compute_info') + def _test_post_live_migration(_get_compute_info): + dest_host = 'dest' + cn = objects.ComputeNode(hypervisor_hostname=dest_host) + _get_compute_info.return_value = cn + instance = fake_instance.fake_instance_obj(self.context, + node='src', + uuid=uuids.instance) + with mock.patch.object(self.compute, "_post_live_migration" + ) as plm, mock.patch.object(instance, "save") as save: + error = ValueError("some failure") + plm.side_effect = error + self.assertRaises( + ValueError, self.compute._post_live_migration_update_host, + self.context, instance, dest_host) + save.assert_called_once() + 
self.assertEqual(instance.host, dest_host) + + _test_post_live_migration() + def test_post_live_migration_cinder_pre_344_api(self): # Because live migration has # succeeded,_post_live_migration_remove_source_vol_connections() From 857df72d3166a8f7e8a8cdfeabb62ad6ead46565 Mon Sep 17 00:00:00 2001 From: Sylvain Bauza Date: Thu, 21 Jul 2022 18:21:51 +0200 Subject: [PATCH 08/73] Reproducer for bug 1951656 Due to a new mdev naming, we can't parse it. Change-Id: I0f785178b132dfef668829558dea9f7e674abadb Related-Bug: #1951656 (cherry picked from commit 185201974775bab966f4e5ca3bbdc31b8269fa4c) --- .../regressions/test_bug_1951656.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 nova/tests/functional/regressions/test_bug_1951656.py diff --git a/nova/tests/functional/regressions/test_bug_1951656.py b/nova/tests/functional/regressions/test_bug_1951656.py new file mode 100644 index 00000000000..9aad191072c --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_1951656.py @@ -0,0 +1,83 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +from oslo_utils import uuidutils + + +from nova.tests.fixtures import libvirt as fakelibvirt +from nova.tests.functional.libvirt import test_vgpu +from nova.virt.libvirt import utils as libvirt_utils + + +class VGPUTestsLibvirt7_7(test_vgpu.VGPUTestBase): + + def _create_mdev(self, physical_device, mdev_type, uuid=None): + # We need to fake the newly created sysfs object by adding a new + # FakeMdevDevice in the existing persisted Connection object so + # when asking to get the existing mdevs, we would see it. + if not uuid: + uuid = uuidutils.generate_uuid() + mdev_name = libvirt_utils.mdev_uuid2name(uuid) + libvirt_parent = self.pci2libvirt_address(physical_device) + + # Libvirt 7.7 now creates mdevs with a parent_addr suffix. + new_mdev_name = '_'.join([mdev_name, libvirt_parent]) + + # Here, we get the right compute thanks by the self.current_host that + # was modified just before + connection = self.computes[ + self._current_host].driver._host.get_connection() + connection.mdev_info.devices.update( + {mdev_name: fakelibvirt.FakeMdevDevice(dev_name=new_mdev_name, + type_id=mdev_type, + parent=libvirt_parent)}) + return uuid + + def setUp(self): + super(VGPUTestsLibvirt7_7, self).setUp() + extra_spec = {"resources:VGPU": "1"} + self.flavor = self._create_flavor(extra_spec=extra_spec) + + # Start compute1 supporting only nvidia-11 + self.flags( + enabled_mdev_types=fakelibvirt.NVIDIA_11_VGPU_TYPE, + group='devices') + + self.compute1 = self.start_compute_with_vgpu('host1') + + def test_create_servers_with_vgpu(self): + + # Create a single instance against a specific compute node. + self._create_server( + image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6', + flavor_id=self.flavor, host=self.compute1.host, + networks='auto', expected_state='ACTIVE') + + # TODO(sbauza): Modify this once bug #1851656 is fixed. 
+ # mdev_name2uuid() raises a badly formed hexadecimal UUID string error + self.assertRaises(ValueError, + self.assert_mdev_usage, + self.compute1, expected_amount=1) + + # Now, the problem is that we can't create new instances with VGPUs + # from this host. + server = self._create_server( + image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6', + flavor_id=self.flavor, host=self.compute1.host, + networks='auto', expected_state='ERROR') + # The error is due to a bad mdev name parsing + self.assertIn('fault', server) + # since we only have one host, we have a RescheduledException as this + # service was creating an exception and we can't use another one. + self.assertIn('Exceeded maximum number of retries', + server['fault']['message']) From 98d8c9eaa3c415cc234193e6a9115db887751363 Mon Sep 17 00:00:00 2001 From: Billy Olsen Date: Thu, 21 Apr 2022 19:42:27 -0700 Subject: [PATCH 09/73] Handle mdev devices in libvirt 7.7+ Libvirt 7.7 changed the mdev device naming to include the parent PCI device when listing node devices. The domain, however, will still only see the UUID and not see the parent PCI device. Changing the parsing to simply drop the PCI identifier is not enough as the device cannot be found when attempting to lookup the new ID. Modify the Libvirt Driver's _get_mediated_device_information to tolerate different formats of the mdev name. This first uses the legacy behavior by trying to lookup the device name that is passed in (typically mdev_ format) and if that is not found, iterates the list of mdev node devices until the right UUID is found and selects that one. Note that the lookup of the mdev device by UUID are needed in order to keep the ability to recreate assigned mediated devices on a reboot of the compute node. Additionally, the libvirt utils parsing method mdev_name2uuid, has been updated to tolerate both mdev_ and mdev__ formats. 
Closes-Bug: 1951656 Change-Id: Ifed0fa16053228990a6a8df8d4c666521db7e329 (cherry picked from commit a28b907c4f0dbba6e141a8fbea807e6cb0438977) --- .../regressions/test_bug_1951656.py | 22 +++------- nova/tests/unit/virt/libvirt/test_config.py | 26 +++++++++++ nova/virt/libvirt/config.py | 3 ++ nova/virt/libvirt/driver.py | 43 +++++++++++++++++-- nova/virt/libvirt/host.py | 2 +- nova/virt/libvirt/utils.py | 28 +++++++++--- 6 files changed, 97 insertions(+), 27 deletions(-) diff --git a/nova/tests/functional/regressions/test_bug_1951656.py b/nova/tests/functional/regressions/test_bug_1951656.py index 9aad191072c..d705ff6fe31 100644 --- a/nova/tests/functional/regressions/test_bug_1951656.py +++ b/nova/tests/functional/regressions/test_bug_1951656.py @@ -63,21 +63,11 @@ def test_create_servers_with_vgpu(self): flavor_id=self.flavor, host=self.compute1.host, networks='auto', expected_state='ACTIVE') - # TODO(sbauza): Modify this once bug #1851656 is fixed. - # mdev_name2uuid() raises a badly formed hexadecimal UUID string error - self.assertRaises(ValueError, - self.assert_mdev_usage, - self.compute1, expected_amount=1) - - # Now, the problem is that we can't create new instances with VGPUs - # from this host. - server = self._create_server( + self.assert_mdev_usage(self.compute1, expected_amount=1) + + self._create_server( image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6', flavor_id=self.flavor, host=self.compute1.host, - networks='auto', expected_state='ERROR') - # The error is due to a bad mdev name parsing - self.assertIn('fault', server) - # since we only have one host, we have a RescheduledException as this - # service was creating an exception and we can't use another one. 
- self.assertIn('Exceeded maximum number of retries', - server['fault']['message']) + networks='auto', expected_state='ACTIVE') + + self.assert_mdev_usage(self.compute1, expected_amount=2) diff --git a/nova/tests/unit/virt/libvirt/test_config.py b/nova/tests/unit/virt/libvirt/test_config.py index c4c9359dd83..4a8aa027a91 100644 --- a/nova/tests/unit/virt/libvirt/test_config.py +++ b/nova/tests/unit/virt/libvirt/test_config.py @@ -3181,6 +3181,32 @@ def test_config_mdev_device(self): config.LibvirtConfigNodeDeviceMdevInformation) self.assertEqual("nvidia-11", obj.mdev_information.type) self.assertEqual(12, obj.mdev_information.iommu_group) + self.assertIsNone(obj.mdev_information.uuid) + + def test_config_mdev_device_uuid(self): + xmlin = """ + + mdev_b2107403_110c_45b0_af87_32cc91597b8a_0000_41_00_0 + /sys/devices/pci0000:40/0000:40:03.1/0000:41:00.0/b2107403-110c-45b0-af87-32cc91597b8a + pci_0000_41_00_0 + + vfio_mdev + + + + b2107403-110c-45b0-af87-32cc91597b8a + + + """ + + obj = config.LibvirtConfigNodeDevice() + obj.parse_str(xmlin) + self.assertIsInstance(obj.mdev_information, + config.LibvirtConfigNodeDeviceMdevInformation) + self.assertEqual("nvidia-442", obj.mdev_information.type) + self.assertEqual(57, obj.mdev_information.iommu_group) + self.assertEqual("b2107403-110c-45b0-af87-32cc91597b8a", + obj.mdev_information.uuid) def test_config_vdpa_device(self): xmlin = """ diff --git a/nova/virt/libvirt/config.py b/nova/virt/libvirt/config.py index 3d91c325c39..0db2dc6b679 100644 --- a/nova/virt/libvirt/config.py +++ b/nova/virt/libvirt/config.py @@ -3382,6 +3382,7 @@ def __init__(self, **kwargs): root_name="capability", **kwargs) self.type = None self.iommu_group = None + self.uuid = None def parse_dom(self, xmldoc): super(LibvirtConfigNodeDeviceMdevInformation, @@ -3391,6 +3392,8 @@ def parse_dom(self, xmldoc): self.type = c.get('id') if c.tag == "iommuGroup": self.iommu_group = int(c.get('number')) + if c.tag == "uuid": + self.uuid = c.text class 
LibvirtConfigNodeDeviceVpdCap(LibvirtConfigObject): diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index ce884dfe306..b2f0fd08625 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -8227,15 +8227,52 @@ def _get_mdev_capable_devices(self, types=None): def _get_mediated_device_information(self, devname): """Returns a dict of a mediated device.""" - virtdev = self._host.device_lookup_by_name(devname) + # LP #1951656 - In Libvirt 7.7, the mdev name now includes the PCI + # address of the parent device (e.g. mdev__) due to + # the mdevctl allowing for multiple mediated devs having the same UUID + # defined (only one can be active at a time). Since the guest + # information doesn't have the parent ID, try to lookup which + # mediated device is available that matches the UUID. If multiple + # devices are found that match the UUID, then this is an error + # condition. + try: + virtdev = self._host.device_lookup_by_name(devname) + except libvirt.libvirtError as ex: + if ex.get_error_code() != libvirt.VIR_ERR_NO_NODE_DEVICE: + raise + mdevs = [dev for dev in self._host.list_mediated_devices() + if dev.startswith(devname)] + # If no matching devices are found, simply raise the original + # exception indicating that no devices are found. + if not mdevs: + raise + elif len(mdevs) > 1: + msg = ("The mediated device name %(devname)s refers to a UUID " + "that is present in multiple libvirt mediated devices. " + "Matching libvirt mediated devices are %(devices)s. " + "Mediated device UUIDs must be unique for Nova." % + {'devname': devname, + 'devices': ', '.join(mdevs)}) + raise exception.InvalidLibvirtMdevConfig(reason=msg) + + LOG.debug('Found requested device %s as %s. 
Using that.', + devname, mdevs[0]) + virtdev = self._host.device_lookup_by_name(mdevs[0]) xmlstr = virtdev.XMLDesc(0) cfgdev = vconfig.LibvirtConfigNodeDevice() cfgdev.parse_str(xmlstr) + # Starting with Libvirt 7.3, the uuid information is available in the + # node device information. If its there, use that. Otherwise, + # fall back to the previous behavior of parsing the uuid from the + # devname. + if cfgdev.mdev_information.uuid: + mdev_uuid = cfgdev.mdev_information.uuid + else: + mdev_uuid = libvirt_utils.mdev_name2uuid(cfgdev.name) device = { "dev_id": cfgdev.name, - # name is like mdev_00ead764_fdc0_46b6_8db9_2963f5c815b4 - "uuid": libvirt_utils.mdev_name2uuid(cfgdev.name), + "uuid": mdev_uuid, # the physical GPU PCI device "parent": cfgdev.parent, "type": cfgdev.mdev_information.type, diff --git a/nova/virt/libvirt/host.py b/nova/virt/libvirt/host.py index 785acdcd186..46435a9a7fd 100644 --- a/nova/virt/libvirt/host.py +++ b/nova/virt/libvirt/host.py @@ -1566,7 +1566,7 @@ def list_mdev_capable_devices(self, flags=0): def list_mediated_devices(self, flags=0): """Lookup mediated devices. - :returns: a list of virNodeDevice instance + :returns: a list of strings with the name of the instance """ return self._list_devices("mdev", flags=flags) diff --git a/nova/virt/libvirt/utils.py b/nova/virt/libvirt/utils.py index c673818603c..0675e4ac146 100644 --- a/nova/virt/libvirt/utils.py +++ b/nova/virt/libvirt/utils.py @@ -575,17 +575,31 @@ def get_default_machine_type(arch: str) -> ty.Optional[str]: def mdev_name2uuid(mdev_name: str) -> str: - """Convert an mdev name (of the form mdev_) to a - uuid (of the form 8-4-4-4-12). + """Convert an mdev name (of the form mdev_ or + mdev__) to a uuid + (of the form 8-4-4-4-12). 
+ + :param mdev_name: the name of the mdev to parse the UUID from + :returns: string containing the uuid """ - return str(uuid.UUID(mdev_name[5:].replace('_', '-'))) + mdev_uuid = mdev_name[5:].replace('_', '-') + # Unconditionnally remove the PCI address from the name + mdev_uuid = mdev_uuid[:36] + return str(uuid.UUID(mdev_uuid)) + +def mdev_uuid2name(mdev_uuid: str, parent: str = None) -> str: + """Convert an mdev uuid (of the form 8-4-4-4-12) and optionally its parent + device to a name (of the form mdev_[_]). -def mdev_uuid2name(mdev_uuid: str) -> str: - """Convert an mdev uuid (of the form 8-4-4-4-12) to a name (of the form - mdev_). + :param mdev_uuid: the uuid of the mediated device + :param parent: the parent device id for the mediated device + :returns: name of the mdev to reference in libvirt """ - return "mdev_" + mdev_uuid.replace('-', '_') + name = "mdev_" + mdev_uuid.replace('-', '_') + if parent and parent.startswith('pci_'): + name = name + parent[4:] + return name def get_flags_by_flavor_specs(flavor: 'objects.Flavor') -> ty.Set[str]: From d92d0934188a14741dd86949ddf98bd1208f3d96 Mon Sep 17 00:00:00 2001 From: Sean Mooney Date: Tue, 8 Nov 2022 15:00:22 +0000 Subject: [PATCH 10/73] Support multiple config files with mod_wsgi Unlike uwsgi, apache mod_wsgi does not support passing commandline arguments to the python wsgi script it invokes. As a result while you can pass --config-file when hosting the api and metadata wsgi applications with uwsgi there is no way to use multiple config files with mod_wsgi. This change mirrors how this is supported in keystone today by introducing a new OS_NOVA_CONFIG_FILES env var to allow operators to optionally pass a ';' delimited list of config files to load. This change also adds docs for this env var and the existing undocumented OS_NOVA_CONFIG_DIR. 
Closes-Bug: 1994056 Change-Id: I8e3ccd75cbb7f2e132b403cb38022787c2c0a37b (cherry picked from commit 73fe84fa0ea6f7c7fa55544f6bce5326d87743a6) --- doc/source/user/wsgi.rst | 14 ++++++++++---- nova/api/openstack/wsgi_app.py | 5 ++++- nova/tests/unit/api/openstack/test_wsgi_app.py | 15 +++++++++++++++ ...nfig-files-with-mod_wsgi-f114ea5fdd8b9a51.yaml | 14 ++++++++++++++ 4 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 releasenotes/notes/multiple-config-files-with-mod_wsgi-f114ea5fdd8b9a51.yaml diff --git a/doc/source/user/wsgi.rst b/doc/source/user/wsgi.rst index 6b314b4832e..63f949df1af 100644 --- a/doc/source/user/wsgi.rst +++ b/doc/source/user/wsgi.rst @@ -8,10 +8,16 @@ as Apache_ or nginx_). The nova project provides two automatically generated entry points that support this: ``nova-api-wsgi`` and ``nova-metadata-wsgi``. These read -``nova.conf`` and ``api-paste.ini`` and generate the required module-level -``application`` that most WSGI servers require. If nova is installed using pip, -these two scripts will be installed into whatever the expected ``bin`` -directory is for the environment. +``nova.conf`` and ``api-paste.ini`` by default and generate the required +module-level ``application`` that most WSGI servers require. +If nova is installed using pip, these two scripts will be installed into +whatever the expected ``bin`` directory is for the environment. + +The config files and config directory can be overridden via the +``OS_NOVA_CONFIG_FILES`` and ``OS_NOVA_CONFIG_DIR`` environment variables. +File paths listed in ``OS_NOVA_CONFIG_FILES`` are relative to +``OS_NOVA_CONFIG_DIR`` and delimited by ``;``. + The new scripts replace older experimental scripts that could be found in the ``nova/wsgi`` directory of the code repository. 
The new scripts are *not* diff --git a/nova/api/openstack/wsgi_app.py b/nova/api/openstack/wsgi_app.py index d60069ce844..6a2b72a6111 100644 --- a/nova/api/openstack/wsgi_app.py +++ b/nova/api/openstack/wsgi_app.py @@ -42,8 +42,11 @@ def _get_config_files(env=None): if env is None: env = os.environ dirname = env.get('OS_NOVA_CONFIG_DIR', '/etc/nova').strip() + files = env.get('OS_NOVA_CONFIG_FILES', '').split(';') + if files == ['']: + files = CONFIG_FILES return [os.path.join(dirname, config_file) - for config_file in CONFIG_FILES] + for config_file in files] def _setup_service(host, name): diff --git a/nova/tests/unit/api/openstack/test_wsgi_app.py b/nova/tests/unit/api/openstack/test_wsgi_app.py index 94e2fe5cb15..0eb7011c116 100644 --- a/nova/tests/unit/api/openstack/test_wsgi_app.py +++ b/nova/tests/unit/api/openstack/test_wsgi_app.py @@ -104,3 +104,18 @@ def test_setup_service_version_workaround(self, mock_check_old, mock_get): 'disable_compute_service_check_for_ffu', True, group='workarounds') wsgi_app._setup_service('myhost', 'api') + + def test__get_config_files_empty_env(self): + env = {} + result = wsgi_app._get_config_files(env) + expected = ['/etc/nova/api-paste.ini', '/etc/nova/nova.conf'] + self.assertEqual(result, expected) + + def test__get_config_files_with_env(self): + env = { + "OS_NOVA_CONFIG_DIR": "/nova", + "OS_NOVA_CONFIG_FILES": "api.conf", + } + result = wsgi_app._get_config_files(env) + expected = ['/nova/api.conf'] + self.assertEqual(result, expected) diff --git a/releasenotes/notes/multiple-config-files-with-mod_wsgi-f114ea5fdd8b9a51.yaml b/releasenotes/notes/multiple-config-files-with-mod_wsgi-f114ea5fdd8b9a51.yaml new file mode 100644 index 00000000000..f4361477deb --- /dev/null +++ b/releasenotes/notes/multiple-config-files-with-mod_wsgi-f114ea5fdd8b9a51.yaml @@ -0,0 +1,14 @@ +--- +fixes: + - | + apache mod_wsgi does not support passing commandline arguments to the wsgi + application that it hosts. 
As a result when the nova api or metadata api + where run under mod_wsgi it was not posible to use multiple config files + or non-default file names i.e. nova-api.conf + This has been adressed by the intoduction of a new, optional, envionment + varible ``OS_NOVA_CONFIG_FILES``. ``OS_NOVA_CONFIG_FILES`` is a ``;`` + seperated list fo file path relitive to ``OS_NOVA_CONFIG_DIR``. + When unset the default ``api-paste.ini`` and ``nova.conf`` will be used + form ``/etc/nova``. This is supported for the nova api and nova metadata + wsgi applications. + From c9de185ea1ac1e8d4435c5863b2ad7cefdb28c76 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Wed, 16 Nov 2022 17:12:40 +0000 Subject: [PATCH 11/73] Ironic nodes with instance reserved in placement Currently, when you delete an ironic instance, we trigger and undeploy in ironic and we release our allocation in placement. We do this well before the ironic node is actually available. We have attempted to fix this my marking unavailable nodes as reserved in placement. This works great until you try and re-image lots of nodes. It turns out, ironic nodes that are waiting for their automatic clean to finish, are returned as a valid allocation candidates for quite some time. Eventually we mark then as reserved. This patch takes a strange approach, if we mark all nodes as reserved as soon as the instance lands, we close the race. That is, when the allocation is removed the node is still unavailable until the next update of placement is done and notices that the node has become available. That may or may not have been after automatic cleaning. The trade off is that when you don't have automatic cleaning, we wait a bit longer to notice the node is available again. Note, this is also useful when a broken Ironic node is marked as in-maintainance while it is in-use by a nova instance. 
In a similar way, we mark the Nova as reserved immmeidately, rather than first waiting for the instance to be deleted before reserving the resources in Placement. Closes-Bug: #1974070 Change-Id: Iab92124b5776a799c7f90d07281d28fcf191c8fe (cherry picked from commit 3c022e968375c1b2eadf3c2dd7190b9434c6d4c1) --- nova/conf/workarounds.py | 15 ++++++ nova/tests/unit/virt/ironic/test_driver.py | 48 +++++++++++++++++-- nova/virt/ironic/driver.py | 26 ++++++---- ...ronic-scheduler-race-08cf8aba0365f512.yaml | 11 +++++ 4 files changed, 89 insertions(+), 11 deletions(-) create mode 100644 releasenotes/notes/fix-ironic-scheduler-race-08cf8aba0365f512.yaml diff --git a/nova/conf/workarounds.py b/nova/conf/workarounds.py index 2ec53282cdb..1116664d36d 100644 --- a/nova/conf/workarounds.py +++ b/nova/conf/workarounds.py @@ -416,6 +416,21 @@ help=""" When this is enabled, it will skip version-checking of hypervisors during live migration. +"""), + cfg.BoolOpt( + 'skip_reserve_in_use_ironic_nodes', + default=False, + help=""" +This may be useful if you use the Ironic driver, but don't have +automatic cleaning enabled in Ironic. Nova, by default, will mark +Ironic nodes as reserved as soon as they are in use. When you free +the Ironic node (by deleting the nova instance) it takes a while +for Nova to un-reserve that Ironic node in placement. Usually this +is a good idea, because it avoids placement providing an Ironic +as a valid candidate when it is still being cleaned. +Howerver, if you don't use automatic cleaning, it can cause an +extra delay before and Ironic node is available for building a +new Nova instance. 
"""), ] diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index 6ac7ca464e5..958623f31a2 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -932,6 +932,48 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr, self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename) + expected = { + 'CUSTOM_IRON_NFV': { + 'total': 1, + 'reserved': 1, + 'min_unit': 1, + 'max_unit': 1, + 'step_size': 1, + 'allocation_ratio': 1.0, + }, + } + mock_nfc.assert_called_once_with(mock.sentinel.nodename) + mock_nr.assert_called_once_with(mock_nfc.return_value) + mock_res_used.assert_called_once_with(mock_nfc.return_value) + mock_res_unavail.assert_called_once_with(mock_nfc.return_value) + result = self.ptree.data(mock.sentinel.nodename).inventory + self.assertEqual(expected, result) + + @mock.patch.object(ironic_driver.IronicDriver, + '_node_resources_used', return_value=True) + @mock.patch.object(ironic_driver.IronicDriver, + '_node_resources_unavailable', return_value=False) + @mock.patch.object(ironic_driver.IronicDriver, '_node_resource') + @mock.patch.object(ironic_driver.IronicDriver, '_node_from_cache') + def test_update_provider_tree_with_rc_occupied_workaround(self, + mock_nfc, mock_nr, mock_res_unavail, mock_res_used): + """Ensure that when a node is used, we report the inventory matching + the consumed resources. 
+ """ + self.flags(skip_reserve_in_use_ironic_nodes=True, + group="workarounds") + mock_nr.return_value = { + 'vcpus': 24, + 'vcpus_used': 24, + 'memory_mb': 1024, + 'memory_mb_used': 1024, + 'local_gb': 100, + 'local_gb_used': 100, + 'resource_class': 'iron-nfv', + } + + self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename) + expected = { 'CUSTOM_IRON_NFV': { 'total': 1, @@ -945,7 +987,7 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr, mock_nfc.assert_called_once_with(mock.sentinel.nodename) mock_nr.assert_called_once_with(mock_nfc.return_value) mock_res_used.assert_called_once_with(mock_nfc.return_value) - self.assertFalse(mock_res_unavail.called) + mock_res_unavail.assert_called_once_with(mock_nfc.return_value) result = self.ptree.data(mock.sentinel.nodename).inventory self.assertEqual(expected, result) @@ -1016,7 +1058,7 @@ def test_update_provider_tree_no_traits(self, mock_nfc, mock_nr, mock_nfc.assert_called_once_with(mock.sentinel.nodename) mock_nr.assert_called_once_with(mock_nfc.return_value) mock_res_used.assert_called_once_with(mock_nfc.return_value) - self.assertFalse(mock_res_unavail.called) + mock_res_unavail.assert_called_once_with(mock_nfc.return_value) result = self.ptree.data(mock.sentinel.nodename).traits self.assertEqual(set(), result) @@ -1048,7 +1090,7 @@ def test_update_provider_tree_with_traits(self, mock_nfc, mock_nr, mock_nfc.assert_called_once_with(mock.sentinel.nodename) mock_nr.assert_called_once_with(mock_nfc.return_value) mock_res_used.assert_called_once_with(mock_nfc.return_value) - self.assertFalse(mock_res_unavail.called) + mock_res_unavail.assert_called_once_with(mock_nfc.return_value) result = self.ptree.data(mock.sentinel.nodename).traits self.assertEqual(set(traits), result) diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index 7496db5a7cd..5583ac52360 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -874,15 +874,25 @@ def 
update_provider_tree(self, provider_tree, nodename, allocations=None): """ # nodename is the ironic node's UUID. node = self._node_from_cache(nodename) + reserved = False - if (not self._node_resources_used(node) and - self._node_resources_unavailable(node)): - LOG.debug('Node %(node)s is not ready for a deployment, ' - 'reporting resources as reserved for it. Node\'s ' - 'provision state is %(prov)s, power state is ' - '%(power)s and maintenance is %(maint)s.', - {'node': node.uuid, 'prov': node.provision_state, - 'power': node.power_state, 'maint': node.maintenance}) + if self._node_resources_unavailable(node): + # Operators might mark a node as in maintainance, + # even when an instance is on the node, + # either way lets mark this as reserved + reserved = True + + if (self._node_resources_used(node) and + not CONF.workarounds.skip_reserve_in_use_ironic_nodes): + # Make resources as reserved once we have + # and instance here. + # When the allocation is deleted, most likely + # automatic clean will start, so we keep the node + # reserved until it becomes available again. + # In the case without automatic clean, once + # the allocation is removed in placement it + # also stays as reserved until we notice on + # the next periodic its actually available. reserved = True info = self._node_resource(node) diff --git a/releasenotes/notes/fix-ironic-scheduler-race-08cf8aba0365f512.yaml b/releasenotes/notes/fix-ironic-scheduler-race-08cf8aba0365f512.yaml new file mode 100644 index 00000000000..4fd2cc1ca9f --- /dev/null +++ b/releasenotes/notes/fix-ironic-scheduler-race-08cf8aba0365f512.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Fixed when placement returns ironic nodes that have just started automatic + cleaning as possible valid candidates. This is done by marking all ironic + nodes with an instance on them as reserved, such that nova only makes them + available once we have double checked Ironic reports the node as available. 
+ If you don't have automatic cleaning on, this might mean it takes longer + than normal for Ironic nodes to become available for new instances. + If you want the old behaviour use the following workaround config: + `[workarounds]skip_reserve_in_use_ironic_nodes=true` From d71e9f6ec4933f9430db55537a36678b16ce895a Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Wed, 18 May 2022 19:06:36 +0100 Subject: [PATCH 12/73] Ironic: retry when node not available After a baremetal instance is deleted, and its allocation is removed in placement, the ironic node might start cleaning. Eventually nova will notice and update the inventory to be reserved. During this window, a new instance may have already picked this ironic node. When that race happens today the build fails with an error: "Failed to reserve node ..." This change tries to ensure the remaining alternative hosts are attempted before aborting the build. Clearly the race is still there, but this makes it less painful. Related-Bug: #1974070 Change-Id: Ie5cdc17219c86927ab3769605808cb9d9fa9fa4d (cherry picked from commit 8a476061c5e034016668cd9e5a20c4430ef6b68d) --- nova/compute/manager.py | 3 +- nova/tests/unit/compute/test_compute_mgr.py | 36 +++++++++++++++++++++ nova/tests/unit/virt/ironic/test_driver.py | 22 +++++++++++-- nova/virt/ironic/driver.py | 12 +++++++ 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index f25d037c504..d29348097fb 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -2736,7 +2736,8 @@ def _build_resources(self, context, instance, requested_networks, block_device_mapping) resources['block_device_info'] = block_device_info except (exception.InstanceNotFound, - exception.UnexpectedDeletingTaskStateError): + exception.UnexpectedDeletingTaskStateError, + exception.ComputeResourcesUnavailable): with excutils.save_and_reraise_exception(): self._build_resources_cleanup(instance, network_info) except 
(exception.UnexpectedTaskStateError, diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 1a4935f482a..e521283acc8 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -7925,6 +7925,42 @@ def test_failed_bdm_prep_from_delete_raises_unexpected(self, mock_clean, mock_prepspawn.assert_called_once_with(self.instance) mock_failedspawn.assert_called_once_with(self.instance) + @mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup') + @mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn') + @mock.patch.object(virt_driver.ComputeDriver, + 'prepare_networks_before_block_device_mapping') + @mock.patch.object(virt_driver.ComputeDriver, + 'clean_networks_preparation') + def test_failed_prepare_for_spawn(self, mock_clean, mock_prepnet, + mock_prepspawn, mock_failedspawn): + mock_prepspawn.side_effect = exception.ComputeResourcesUnavailable( + reason="asdf") + with mock.patch.object(self.compute, + '_build_networks_for_instance', + return_value=self.network_info + ) as _build_networks_for_instance: + + try: + with self.compute._build_resources(self.context, self.instance, + self.requested_networks, self.security_groups, + self.image, self.block_device_mapping, + self.resource_provider_mapping, self.accel_uuids): + pass + except Exception as e: + self.assertIsInstance(e, + exception.ComputeResourcesUnavailable) + + _build_networks_for_instance.assert_has_calls( + [mock.call(self.context, self.instance, + self.requested_networks, self.security_groups, + self.resource_provider_mapping, + self.network_arqs)]) + + mock_prepnet.assert_not_called() + mock_clean.assert_called_once_with(self.instance, self.network_info) + mock_prepspawn.assert_called_once_with(self.instance) + mock_failedspawn.assert_called_once_with(self.instance) + @mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup') @mock.patch.object(virt_driver.ComputeDriver, 
'prepare_for_spawn') @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance') diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index 6ac7ca464e5..9b5e31db831 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -2500,7 +2500,10 @@ def test_ironicclient_bad_response(self, mock_error): @mock.patch.object(cw.IronicClientWrapper, 'call') def test_prepare_for_spawn(self, mock_call): - node = ironic_utils.get_test_node(driver='fake') + node = ironic_utils.get_test_node( + driver='fake', instance_uuid=None, + provision_state=ironic_states.AVAILABLE, + power_state=ironic_states.POWER_OFF) self.mock_conn.get_node.return_value = node instance = fake_instance.fake_instance_obj(self.ctx, node=node.uuid) @@ -2532,7 +2535,10 @@ def test_prepare_for_spawn_invalid_instance(self): instance) def test_prepare_for_spawn_conflict(self): - node = ironic_utils.get_test_node(driver='fake') + node = ironic_utils.get_test_node( + driver='fake', instance_uuid=None, + provision_state=ironic_states.AVAILABLE, + power_state=ironic_states.POWER_OFF) self.mock_conn.get_node.return_value = node self.mock_conn.update_node.side_effect = sdk_exc.ConflictException instance = fake_instance.fake_instance_obj(self.ctx, node=node.id) @@ -2540,6 +2546,18 @@ def test_prepare_for_spawn_conflict(self): self.driver.prepare_for_spawn, instance) + def test_prepare_for_spawn_not_available(self): + node = ironic_utils.get_test_node( + driver='fake', instance_uuid=None, + provision_state=ironic_states.CLEANWAIT, + power_state=ironic_states.POWER_OFF) + self.mock_conn.get_node.return_value = node + self.mock_conn.update_node.side_effect = sdk_exc.ConflictException + instance = fake_instance.fake_instance_obj(self.ctx, node=node.id) + self.assertRaises(exception.ComputeResourcesUnavailable, + self.driver.prepare_for_spawn, + instance) + @mock.patch.object(ironic_driver.IronicDriver, 
'_cleanup_deploy') def test_failed_spawn_cleanup(self, mock_cleanup): node = ironic_utils.get_test_node(driver='fake') diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index 7496db5a7cd..5f5f3a6dd79 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -397,6 +397,18 @@ def prepare_for_spawn(self, instance): _("Ironic node uuid not supplied to " "driver for instance %s.") % instance.uuid) node = self._get_node(node_uuid) + + # Its possible this node has just moved from deleting + # to cleaning. Placement will update the inventory + # as all reserved, but this instance might have got here + # before that happened, but after the previous allocation + # got deleted. We trigger a re-schedule to another node. + if (self._node_resources_used(node) or + self._node_resources_unavailable(node)): + msg = "Chosen ironic node %s is not available" % node_uuid + LOG.info(msg, instance=instance) + raise exception.ComputeResourcesUnavailable(reason=msg) + self._set_instance_id(node, instance) def failed_spawn_cleanup(self, instance): From 03374cf4a2ff98c938691a209d6a3fb14a06d3a0 Mon Sep 17 00:00:00 2001 From: Jorge San Emeterio Date: Tue, 11 Oct 2022 13:14:12 +0200 Subject: [PATCH 13/73] Improving logging at '_allocate_mdevs'. Adding both 'info' and 'debug' messages with the intention of telling which mdevs are available, which get allocated and whether new ones are created. 
Closes-Bug: #1992451 Change-Id: Ibd331df51fd4eaeed4831a98469f06a4ce0cd452 (cherry picked from commit 6feb3350b048606297068841e3feba110bb0b0ab) --- nova/virt/libvirt/driver.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index ce884dfe306..be458dbd5b7 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -8323,6 +8323,7 @@ def _get_existing_mdevs_not_assigned(self, parent, requested_types=None): :param requested_types: Filter out the result for only mediated devices having those types. """ + LOG.debug('Searching for available mdevs...') allocated_mdevs = self._get_all_assigned_mediated_devices() mdevs = self._get_mediated_devices(requested_types) available_mdevs = set() @@ -8338,6 +8339,7 @@ def _get_existing_mdevs_not_assigned(self, parent, requested_types=None): available_mdevs.add(mdev["uuid"]) available_mdevs -= set(allocated_mdevs) + LOG.info('Available mdevs at: %s.', available_mdevs) return available_mdevs def _create_new_mediated_device(self, parent, uuid=None): @@ -8349,6 +8351,7 @@ def _create_new_mediated_device(self, parent, uuid=None): :returns: the newly created mdev UUID or None if not possible """ + LOG.debug('Attempting to create new mdev...') supported_types = self.supported_vgpu_types # Try to see if we can still create a new mediated device devices = self._get_mdev_capable_devices(supported_types) @@ -8360,6 +8363,7 @@ def _create_new_mediated_device(self, parent, uuid=None): # The device is not the one that was called, not creating # the mdev continue + LOG.debug('Trying on: %s.', dev_name) dev_supported_type = self._get_vgpu_type_per_pgpu(dev_name) if dev_supported_type and device['types'][ dev_supported_type]['availableInstances'] > 0: @@ -8369,7 +8373,13 @@ def _create_new_mediated_device(self, parent, uuid=None): pci_addr = "{}:{}:{}.{}".format(*dev_name[4:].split('_')) chosen_mdev = nova.privsep.libvirt.create_mdev( pci_addr, 
dev_supported_type, uuid=uuid) + LOG.info('Created mdev: %s on pGPU: %s.', + chosen_mdev, pci_addr) return chosen_mdev + LOG.debug('Failed: No available instances on device.') + LOG.info('Failed to create mdev. ' + 'No free space found among the following devices: %s.', + [dev['dev_id'] for dev in devices]) @utils.synchronized(VGPU_RESOURCE_SEMAPHORE) def _allocate_mdevs(self, allocations): @@ -8452,6 +8462,8 @@ def _allocate_mdevs(self, allocations): # Take the first available mdev chosen_mdev = mdevs_available.pop() else: + LOG.debug('No available mdevs where found. ' + 'Creating an new one...') chosen_mdev = self._create_new_mediated_device(parent_device) if not chosen_mdev: # If we can't find devices having available VGPUs, just raise @@ -8459,6 +8471,7 @@ def _allocate_mdevs(self, allocations): reason='mdev-capable resource is not available') else: chosen_mdevs.append(chosen_mdev) + LOG.info('Allocated mdev: %s.', chosen_mdev) return chosen_mdevs def _detach_mediated_devices(self, guest): From ea789a62220633d1b5d985d8eb2ec7189bc92a02 Mon Sep 17 00:00:00 2001 From: Rajesh Tailor Date: Fri, 11 Nov 2022 16:03:53 +0530 Subject: [PATCH 14/73] Correct config help message related options The options list in 'Related Options:' section doesn't rendered as bulleted list for some params because of missing blank line. This changes adds missing blank line wherever needed in [1]. [1] https://docs.openstack.org/nova/latest/configuration/config.html Change-Id: I7077aea2abcf3cab67592879ebd1fde066bfcac5 (cherry picked from commit ac42c43e431b2bd1089910cd52aec8552a8e9755) --- nova/conf/ironic.py | 1 + nova/conf/libvirt.py | 1 + nova/conf/mks.py | 2 ++ nova/conf/vmware.py | 5 +++++ 4 files changed, 9 insertions(+) diff --git a/nova/conf/ironic.py b/nova/conf/ironic.py index dc5d2412c4c..2734f2b78ac 100644 --- a/nova/conf/ironic.py +++ b/nova/conf/ironic.py @@ -27,6 +27,7 @@ help=""" Configuration options for Ironic driver (Bare Metal). 
If using the Ironic driver following options must be set: + * auth_type * auth_url * project_name diff --git a/nova/conf/libvirt.py b/nova/conf/libvirt.py index 4ea37b8fe97..16a3f630902 100644 --- a/nova/conf/libvirt.py +++ b/nova/conf/libvirt.py @@ -987,6 +987,7 @@ according to the poll interval. Related options: + * images_type - must be set to ``rbd`` * images_rbd_glance_store_name - must be set to a store name * images_rbd_glance_copy_poll_interval - controls the failure time-to-notice diff --git a/nova/conf/mks.py b/nova/conf/mks.py index 1703f5f2404..ec403a1a4fc 100644 --- a/nova/conf/mks.py +++ b/nova/conf/mks.py @@ -23,7 +23,9 @@ instance console access to VM's created by VMware hypervisors. Related options: + Following options must be set to provide console access. + * mksproxy_base_url * enabled """) diff --git a/nova/conf/vmware.py b/nova/conf/vmware.py index 63a5f04ea4a..17a2676b643 100644 --- a/nova/conf/vmware.py +++ b/nova/conf/vmware.py @@ -76,7 +76,9 @@ * Any valid URI (The scheme is 'telnet' or 'telnets'.) Related options: + This option is ignored if serial_port_service_uri is not specified. + * serial_port_service_uri """), cfg.StrOpt('serial_log_dir', @@ -112,6 +114,7 @@ then the default CA truststore is used for verification. Related options: + * ca_file: This option is ignored if "ca_file" is set. """), cfg.StrOpt('cluster_name', @@ -158,7 +161,9 @@ * Any valid port number within 5900 -(5900 + vnc_port_total) Related options: + Below options should be set to enable VNC client. + * vnc.enabled = True * vnc_port_total """), From 6e8ed78470edbae7b58d75e9e9f4f62bdb30a170 Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Thu, 10 Nov 2022 09:55:48 -0800 Subject: [PATCH 15/73] [stable-only][cve] Check VMDK create-type against an allowed list NOTE(sbauza): Stable policy allows us to proactively merge a backport without waiting for the parent patch to be merged (exception to rule #4 in [1]. 
Marking [stable-only] in order to silence nova-tox-validate-backport [1] https://docs.openstack.org/project-team-guide/stable-branches.html#appropriate-fixes Related-Bug: #1996188 Change-Id: I5a399f1d3d702bfb76c067893e9c924904c8c360 --- nova/conf/compute.py | 9 ++++++ nova/tests/unit/virt/test_images.py | 46 +++++++++++++++++++++++++++++ nova/virt/images.py | 31 +++++++++++++++++++ 3 files changed, 86 insertions(+) diff --git a/nova/conf/compute.py b/nova/conf/compute.py index 71707cfd531..9675fff8293 100644 --- a/nova/conf/compute.py +++ b/nova/conf/compute.py @@ -1015,6 +1015,15 @@ * ``[scheduler]query_placement_for_image_type_support`` - enables filtering computes based on supported image types, which is required to be enabled for this to take effect. +"""), + cfg.ListOpt('vmdk_allowed_types', + default=['streamOptimized', 'monolithicSparse'], + help=""" +A list of strings describing allowed VMDK "create-type" subformats +that will be allowed. This is recommended to only include +single-file-with-sparse-header variants to avoid potential host file +exposure due to processing named extents. If this list is empty, then no +form of VMDK image will be allowed. 
"""), cfg.BoolOpt('packing_host_numa_cells_allocation_strategy', default=False, diff --git a/nova/tests/unit/virt/test_images.py b/nova/tests/unit/virt/test_images.py index 58581d93ba1..62a61c1e8b7 100644 --- a/nova/tests/unit/virt/test_images.py +++ b/nova/tests/unit/virt/test_images.py @@ -16,6 +16,8 @@ from unittest import mock from oslo_concurrency import processutils +from oslo_serialization import jsonutils +from oslo_utils import imageutils from nova.compute import utils as compute_utils from nova import exception @@ -135,3 +137,47 @@ def test_convert_image_without_direct_io_support(self, mock_execute, '-O', 'out_format', '-f', 'in_format', 'source', 'dest') mock_disk_op_sema.__enter__.assert_called_once() self.assertTupleEqual(expected, mock_execute.call_args[0]) + + def test_convert_image_vmdk_allowed_list_checking(self): + info = {'format': 'vmdk', + 'format-specific': { + 'type': 'vmdk', + 'data': { + 'create-type': 'monolithicFlat', + }}} + + # If the format is not in the allowed list, we should get an error + self.assertRaises(exception.ImageUnacceptable, + images.check_vmdk_image, 'foo', + imageutils.QemuImgInfo(jsonutils.dumps(info), + format='json')) + + # With the format in the allowed list, no error + self.flags(vmdk_allowed_types=['streamOptimized', 'monolithicFlat', + 'monolithicSparse'], + group='compute') + images.check_vmdk_image('foo', + imageutils.QemuImgInfo(jsonutils.dumps(info), + format='json')) + + # With an empty list, allow nothing + self.flags(vmdk_allowed_types=[], group='compute') + self.assertRaises(exception.ImageUnacceptable, + images.check_vmdk_image, 'foo', + imageutils.QemuImgInfo(jsonutils.dumps(info), + format='json')) + + @mock.patch.object(images, 'fetch') + @mock.patch('nova.privsep.qemu.unprivileged_qemu_img_info') + def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch): + info = {'format': 'vmdk', + 'format-specific': { + 'type': 'vmdk', + 'data': { + 'create-type': 'monolithicFlat', + }}} + 
mock_info.return_value = jsonutils.dumps(info) + with mock.patch('os.path.exists', return_value=True): + e = self.assertRaises(exception.ImageUnacceptable, + images.fetch_to_raw, None, 'foo', 'anypath') + self.assertIn('Invalid VMDK create-type specified', str(e)) diff --git a/nova/virt/images.py b/nova/virt/images.py index 5358f3766ac..f13c8722909 100644 --- a/nova/virt/images.py +++ b/nova/virt/images.py @@ -110,6 +110,34 @@ def get_info(context, image_href): return IMAGE_API.get(context, image_href) +def check_vmdk_image(image_id, data): + # Check some rules about VMDK files. Specifically we want to make + # sure that the "create-type" of the image is one that we allow. + # Some types of VMDK files can reference files outside the disk + # image and we do not want to allow those for obvious reasons. + + types = CONF.compute.vmdk_allowed_types + + if not len(types): + LOG.warning('Refusing to allow VMDK image as vmdk_allowed_' + 'types is empty') + msg = _('Invalid VMDK create-type specified') + raise exception.ImageUnacceptable(image_id=image_id, reason=msg) + + try: + create_type = data.format_specific['data']['create-type'] + except KeyError: + msg = _('Unable to determine VMDK create-type') + raise exception.ImageUnacceptable(image_id=image_id, reason=msg) + + if create_type not in CONF.compute.vmdk_allowed_types: + LOG.warning('Refusing to process VMDK file with create-type of %r ' + 'which is not in allowed set of: %s', create_type, + ','.join(CONF.compute.vmdk_allowed_types)) + msg = _('Invalid VMDK create-type specified') + raise exception.ImageUnacceptable(image_id=image_id, reason=msg) + + def fetch_to_raw(context, image_href, path, trusted_certs=None): path_tmp = "%s.part" % path fetch(context, image_href, path_tmp, trusted_certs) @@ -129,6 +157,9 @@ def fetch_to_raw(context, image_href, path, trusted_certs=None): reason=(_("fmt=%(fmt)s backed by: %(backing_file)s") % {'fmt': fmt, 'backing_file': backing_file})) + if fmt == 'vmdk': + 
check_vmdk_image(image_href, data) + if fmt != "raw" and CONF.force_raw_images: staged = "%s.converted" % path LOG.debug("%s was %s, converting to raw", image_href, fmt) From 2ea2b556da5f10d662641bd96b0a07735d2b9607 Mon Sep 17 00:00:00 2001 From: Alexey Stupnikov Date: Fri, 8 Jul 2022 17:56:38 +0200 Subject: [PATCH 16/73] Remove deleted projects from flavor access list Previously Nova was unable to remove deleted projects from flavor's access lists. This patch lifts described limitation and improves logic of nova/api/openstack/identity.py library by introducing two separate kinds of exceptions: - webob.exc.HTTPInternalServerError is raised when Keystone identity service version 3.0 was not found. - webob.exc.HTTPBadRequest is raised when specified project is not found. Closes-bug: #1980845 Change-Id: Icbf3bdd944f9a6c38f25ddea0b521ca48ee87a7f (cherry picked from commit 8c6daaacbedc33e738ce85aec0ead5f6947d60bf) --- nova/api/openstack/compute/flavor_access.py | 9 ++++++- nova/api/openstack/identity.py | 22 +++++++++------- .../openstack/compute/test_flavor_access.py | 25 ++++++++++++++++++- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/nova/api/openstack/compute/flavor_access.py b/nova/api/openstack/compute/flavor_access.py index e17e6f0ddcd..fc8df15db5b 100644 --- a/nova/api/openstack/compute/flavor_access.py +++ b/nova/api/openstack/compute/flavor_access.py @@ -93,7 +93,14 @@ def _remove_tenant_access(self, req, id, body): vals = body['removeTenantAccess'] tenant = vals['tenant'] - identity.verify_project_id(context, tenant) + # It doesn't really matter if project exists or not: we can delete + # it from flavor's access list in both cases. + try: + identity.verify_project_id(context, tenant) + except webob.exc.HTTPBadRequest as identity_exc: + msg = "Project ID %s is not a valid project." 
% tenant + if msg not in identity_exc.explanation: + raise # NOTE(gibi): We have to load a flavor from the db here as # flavor.remove_access() will try to emit a notification and that needs diff --git a/nova/api/openstack/identity.py b/nova/api/openstack/identity.py index 7ffc623fede..15ec884aea8 100644 --- a/nova/api/openstack/identity.py +++ b/nova/api/openstack/identity.py @@ -27,24 +27,27 @@ def verify_project_id(context, project_id): """verify that a project_id exists. This attempts to verify that a project id exists. If it does not, - an HTTPBadRequest is emitted. + an HTTPBadRequest is emitted. Also HTTPBadRequest is emitted + if Keystone identity service version 3.0 is not found. """ adap = utils.get_ksa_adapter( 'identity', ksa_auth=context.get_auth_plugin(), min_version=(3, 0), max_version=(3, 'latest')) - failure = webob.exc.HTTPBadRequest( - explanation=_("Project ID %s is not a valid project.") % - project_id) try: resp = adap.get('/projects/%s' % project_id) except kse.EndpointNotFound: LOG.error( - "Keystone identity service version 3.0 was not found. This might " - "be because your endpoint points to the v2.0 versioned endpoint " - "which is not supported. Please fix this.") - raise failure + "Keystone identity service version 3.0 was not found. This " + "might be caused by Nova misconfiguration or Keystone " + "problems.") + msg = _("Nova was unable to find Keystone service endpoint.") + # TODO(astupnik). It may be reasonable to switch to HTTP 503 + # (HTTP Service Unavailable) instead of HTTP Bad Request here. + # If proper Keystone servie is inaccessible, then technially + # this is a server side error and not an error in Nova. 
+ raise webob.exc.HTTPBadRequest(explanation=msg) except kse.ClientException: # something is wrong, like there isn't a keystone v3 endpoint, # or nova isn't configured for the interface to talk to it; @@ -57,7 +60,8 @@ def verify_project_id(context, project_id): return True elif resp.status_code == 404: # we got access, and we know this project is not there - raise failure + msg = _("Project ID %s is not a valid project.") % project_id + raise webob.exc.HTTPBadRequest(explanation=msg) elif resp.status_code == 403: # we don't have enough permission to verify this, so default # to "it's ok". diff --git a/nova/tests/unit/api/openstack/compute/test_flavor_access.py b/nova/tests/unit/api/openstack/compute/test_flavor_access.py index 0581a47c84c..ea9ca2f6328 100644 --- a/nova/tests/unit/api/openstack/compute/test_flavor_access.py +++ b/nova/tests/unit/api/openstack/compute/test_flavor_access.py @@ -353,14 +353,37 @@ def test_add_tenant_access_with_invalid_tenant(self, mock_verify): mock_verify.assert_called_once_with( req.environ['nova.context'], 'proj2') + @mock.patch('nova.objects.Flavor.remove_access') @mock.patch('nova.api.openstack.identity.verify_project_id', side_effect=exc.HTTPBadRequest( explanation="Project ID proj2 is not a valid project.")) - def test_remove_tenant_access_with_invalid_tenant(self, mock_verify): + def test_remove_tenant_access_with_invalid_tenant(self, + mock_verify, + mock_remove_access): """Tests the case that the tenant does not exist in Keystone.""" req = fakes.HTTPRequest.blank(self._prefix + '/flavors/2/action', use_admin_context=True) body = {'removeTenantAccess': {'tenant': 'proj2'}} + + self.flavor_action_controller._remove_tenant_access( + req, '2', body=body) + mock_verify.assert_called_once_with( + req.environ['nova.context'], 'proj2') + mock_remove_access.assert_called_once_with('proj2') + + @mock.patch('nova.api.openstack.identity.verify_project_id', + side_effect=exc.HTTPBadRequest( + explanation="Nova was unable to find 
Keystone " + "service endpoint.")) + def test_remove_tenant_access_missing_keystone_endpoint(self, + mock_verify): + """Tests the case that Keystone identity service endpoint + version 3.0 was not found. + """ + req = fakes.HTTPRequest.blank(self._prefix + '/flavors/2/action', + use_admin_context=True) + body = {'removeTenantAccess': {'tenant': 'proj2'}} + self.assertRaises(exc.HTTPBadRequest, self.flavor_action_controller._remove_tenant_access, req, '2', body=body) From 1e1e40433c5df3cea099adc2894895701672cbeb Mon Sep 17 00:00:00 2001 From: Kashyap Chamarthy Date: Tue, 17 Jan 2023 11:15:37 +0100 Subject: [PATCH 17/73] libvirt: At start-up rework compareCPU() usage with a workaround In this patch: - Remove the first compareCPU() call (called via the wrapper _compare_cpu()) in _check_cpu_compatibility(), and let libvirt handle it. (QEMU >=2.9 and libvirt >= 4.4.0 are the mininum required versions, and upstream Nova satisfies them by a good margin.) - Validate the user-configured CPU models from _get_cpu_model_mapping(). And take into account all the CPU flags before calling _compare_cpu(). (Suggested-by: Sean Mooney -- thanks!) - Add a workaround to allow skipping the remaining compareCPU() call in _check_cpu_compatibility() as a potential future-proof (because we cannot test all possible CPU models and hardware). Unlike the removed first call, this call takes into account the extra CPU flags provided by the user into account when evaluating guest CPU model compatibility. As a follow up comes the patch[1] that replaces the older libvirt CPU API with the newer one. 
[1] https://review.opendev.org/c/openstack/nova/+/869950 -- libvirt: Replace usage of compareCPU() with compareHypervisorCPU() Change-Id: I8ef9db851b37c5249d2efbe09a15a1ddbae8205d Signed-off-by: Kashyap Chamarthy (cherry picked from commit 9caaaf1f221063a4329c72c8b67a6015648644a2) --- nova/conf/workarounds.py | 7 +++ nova/tests/unit/virt/libvirt/test_driver.py | 22 ++++++++-- nova/virt/libvirt/driver.py | 47 +++++++++------------ 3 files changed, 46 insertions(+), 30 deletions(-) diff --git a/nova/conf/workarounds.py b/nova/conf/workarounds.py index 1116664d36d..e485ae673a5 100644 --- a/nova/conf/workarounds.py +++ b/nova/conf/workarounds.py @@ -410,6 +410,13 @@ 4.4.0, libvirt will do the correct thing with respect to checking CPU compatibility on the destination host during live migration. """), + cfg.BoolOpt('skip_cpu_compare_at_startup', + default=False, + help=""" +This will skip the CPU comparison call at the startup of Compute +service and lets libvirt handle it. +"""), + cfg.BoolOpt( 'skip_hypervisor_version_check_on_lm', default=False, diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 86e3661a343..f08c6e2b2f3 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -1330,6 +1330,22 @@ def test__check_cpu_compatibility_advance_model(self, mocked_compare): self.assertRaises(exception.InvalidCPUInfo, drvr.init_host, "dummyhost") + @mock.patch.object(libvirt_driver.LibvirtDriver, + '_register_all_undefined_instance_details', + new=mock.Mock()) + @mock.patch('nova.virt.libvirt.host.libvirt.Connection.compareCPU') + def test__check_cpu_compatibility_skip_compare_at_init( + self, mocked_compare + ): + self.flags(group='workarounds', skip_cpu_compare_at_startup=True) + self.flags(cpu_mode="custom", + cpu_models=["Icelake-Server-noTSX"], + cpu_model_extra_flags = ["-mpx"], + group="libvirt") + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) + 
drvr.init_host("dummyhost") + mocked_compare.assert_not_called() + @mock.patch.object(libvirt_driver.LibvirtDriver, '_register_all_undefined_instance_details', new=mock.Mock()) @@ -1343,7 +1359,7 @@ def test__check_cpu_compatibility_with_flag(self): @mock.patch('nova.virt.libvirt.host.libvirt.Connection.compareCPU') def test__check_cpu_compatibility_advance_flag(self, mocked_compare): - mocked_compare.side_effect = (2, 0) + mocked_compare.side_effect = (-1, 0) self.flags(cpu_mode="custom", cpu_models=["qemu64"], cpu_model_extra_flags = ["avx", "avx2"], @@ -1356,7 +1372,7 @@ def test__check_cpu_compatibility_advance_flag(self, mocked_compare): def test__check_cpu_compatibility_wrong_flag(self, mocked_compare): # here, and in the surrounding similar tests, the non-zero error # code in the compareCPU() side effect indicates error - mocked_compare.side_effect = (2, 0) + mocked_compare.side_effect = (-1, 0) self.flags(cpu_mode="custom", cpu_models=["Broadwell-noTSX"], cpu_model_extra_flags = ["a v x"], @@ -1369,7 +1385,7 @@ def test__check_cpu_compatibility_wrong_flag(self, mocked_compare): def test__check_cpu_compatibility_enabled_and_disabled_flags( self, mocked_compare ): - mocked_compare.side_effect = (2, 0) + mocked_compare.side_effect = (-1, 0) self.flags( cpu_mode="custom", cpu_models=["Cascadelake-Server"], diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index ce884dfe306..8a05921f128 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -984,33 +984,26 @@ def _check_cpu_compatibility(self): msg = _("The cpu_models option is required when cpu_mode=custom") raise exception.Invalid(msg) - cpu = vconfig.LibvirtConfigGuestCPU() - for model in models: - cpu.model = self._get_cpu_model_mapping(model) - try: - self._compare_cpu(cpu, self._get_cpu_info(), None) - except exception.InvalidCPUInfo as e: - msg = (_("Configured CPU model: %(model)s is not " - "compatible with host CPU. 
Please correct your " - "config and try again. %(e)s") % { - 'model': model, 'e': e}) - raise exception.InvalidCPUInfo(msg) - - # Use guest CPU model to check the compatibility between guest CPU and - # configured extra_flags - cpu = vconfig.LibvirtConfigGuestCPU() - cpu.model = self._host.get_capabilities().host.cpu.model - for flag in set(x.lower() for x in CONF.libvirt.cpu_model_extra_flags): - cpu_feature = self._prepare_cpu_flag(flag) - cpu.add_feature(cpu_feature) - try: - self._compare_cpu(cpu, self._get_cpu_info(), None) - except exception.InvalidCPUInfo as e: - msg = (_("Configured extra flag: %(flag)s it not correct, or " - "the host CPU does not support this flag. Please " - "correct the config and try again. %(e)s") % { - 'flag': flag, 'e': e}) - raise exception.InvalidCPUInfo(msg) + if not CONF.workarounds.skip_cpu_compare_at_startup: + # Use guest CPU model to check the compatibility between + # guest CPU and configured extra_flags + for model in models: + cpu = vconfig.LibvirtConfigGuestCPU() + cpu.model = self._get_cpu_model_mapping(model) + for flag in set(x.lower() for + x in CONF.libvirt.cpu_model_extra_flags): + cpu_feature = self._prepare_cpu_flag(flag) + cpu.add_feature(cpu_feature) + try: + self._compare_cpu(cpu, self._get_cpu_info(), None) + except exception.InvalidCPUInfo as e: + msg = (_("Configured CPU model: %(model)s " + "and CPU Flags %(flags)s ar not " + "compatible with host CPU. Please correct your " + "config and try again. 
%(e)s") % { + 'model': model, 'e': e, + 'flags': CONF.libvirt.cpu_model_extra_flags}) + raise exception.InvalidCPUInfo(msg) def _check_vtpm_support(self) -> None: # TODO(efried): A key manager must be configured to create/retrieve From 71855163a944e437f9c48a5765f683b55a28c720 Mon Sep 17 00:00:00 2001 From: Rajesh Tailor Date: Tue, 11 Oct 2022 18:01:17 +0530 Subject: [PATCH 18/73] Handle InstanceInvalidState exception When instance task state is 'deleting' or 'migrating', then get_vnc_console throws 500 error, as InstanceInvalidState exception is not handled there. This change handles InstanceInvalidState in api layer in get_vnc_console call. Closes-Bug: #1968618 Change-Id: Ia738a0972b050f549f446c85171d3f33e60ada4f (cherry picked from commit ec40d5aee34e9428e2a19231fc3df4d23d75b779) --- nova/api/openstack/compute/remote_consoles.py | 3 +++ .../api_sample_tests/test_remote_consoles.py | 20 +++++++++++++++++++ .../openstack/compute/test_remote_consoles.py | 12 +++++++++++ 3 files changed, 35 insertions(+) diff --git a/nova/api/openstack/compute/remote_consoles.py b/nova/api/openstack/compute/remote_consoles.py index 36015542aa3..7d374ef432e 100644 --- a/nova/api/openstack/compute/remote_consoles.py +++ b/nova/api/openstack/compute/remote_consoles.py @@ -56,6 +56,9 @@ def get_vnc_console(self, req, id, body): raise webob.exc.HTTPNotFound(explanation=e.format_message()) except exception.InstanceNotReady as e: raise webob.exc.HTTPConflict(explanation=e.format_message()) + except exception.InstanceInvalidState as e: + common.raise_http_conflict_for_instance_invalid_state( + e, 'get_vnc_console', id) except NotImplementedError: common.raise_feature_not_supported() diff --git a/nova/tests/functional/api_sample_tests/test_remote_consoles.py b/nova/tests/functional/api_sample_tests/test_remote_consoles.py index 986826bfee0..e304402ee94 100644 --- a/nova/tests/functional/api_sample_tests/test_remote_consoles.py +++ 
b/nova/tests/functional/api_sample_tests/test_remote_consoles.py @@ -13,6 +13,10 @@ # License for the specific language governing permissions and limitations # under the License. +from unittest import mock + +from nova.compute import api as compute +from nova import exception from nova.tests.functional.api_sample_tests import test_servers HTTP_RE = r'(https?://)([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*' @@ -38,6 +42,22 @@ def test_get_vnc_console(self): self._verify_response('get-vnc-console-post-resp', {'url': HTTP_RE}, response, 200) + @mock.patch.object(compute.API, 'get_vnc_console') + def test_get_vnc_console_instance_invalid_state(self, + mock_get_vnc_console): + uuid = self._post_server() + + def fake_get_vnc_console(*args, **kwargs): + raise exception.InstanceInvalidState( + attr='fake_attr', state='fake_state', method='fake_method', + instance_uuid=uuid) + + mock_get_vnc_console.side_effect = fake_get_vnc_console + response = self._do_post('servers/%s/action' % uuid, + 'get-vnc-console-post-req', + {'action': 'os-getVNCConsole'}) + self.assertEqual(409, response.status_code) + def test_get_spice_console(self): uuid = self._post_server() response = self._do_post('servers/%s/action' % uuid, diff --git a/nova/tests/unit/api/openstack/compute/test_remote_consoles.py b/nova/tests/unit/api/openstack/compute/test_remote_consoles.py index bd093075677..961f4a02c91 100644 --- a/nova/tests/unit/api/openstack/compute/test_remote_consoles.py +++ b/nova/tests/unit/api/openstack/compute/test_remote_consoles.py @@ -104,6 +104,18 @@ def test_get_vnc_console_no_instance_on_console_get(self): 'get_vnc_console', exception.InstanceNotFound(instance_id=fakes.FAKE_UUID)) + def test_get_vnc_console_instance_invalid_state(self): + body = {'os-getVNCConsole': {'type': 'novnc'}} + self._check_console_failure( + self.controller.get_vnc_console, + webob.exc.HTTPConflict, + body, + 'get_vnc_console', + exception.InstanceInvalidState( + attr='fake-attr', state='fake-state', 
method='fake-method', + instance_uuid=fakes.FAKE_UUID) + ) + def test_get_vnc_console_invalid_type(self): body = {'os-getVNCConsole': {'type': 'invalid'}} self._check_console_failure( From d00a848a735f98b028f5930798ee69ef205c8e2e Mon Sep 17 00:00:00 2001 From: Rajesh Tailor Date: Wed, 10 Aug 2022 18:15:04 +0530 Subject: [PATCH 19/73] Fix rescue volume-based instance As of now, when attempting to rescue a volume-based instance using an image without the hw_rescue_device and/or hw_rescue_bus properties set, the rescue api call fails (as non-stable rescue for volume-based instances are not supported) leaving the instance in error state. This change checks for hw_rescue_device/hw_rescue_bus image properties before attempting to rescue and if the property is not set, then fail with proper error message, without changing instance state. Related-Bug: #1978958 Closes-Bug: #1926601 Change-Id: Id4c8c5f3b32985ac7d3d7c833b82e0876f7367c1 (cherry picked from commit 6eed55bf55469f4ceaa7d4d4eb1be635e14bc73b) --- nova/compute/api.py | 6 + nova/tests/functional/test_server_rescue.py | 86 +++++++++-- nova/tests/unit/compute/test_api.py | 133 +++++++++++++++++- ...olume-based-instance-c6e3fba236d90be7.yaml | 6 + 4 files changed, 220 insertions(+), 11 deletions(-) create mode 100644 releasenotes/notes/rescue-volume-based-instance-c6e3fba236d90be7.yaml diff --git a/nova/compute/api.py b/nova/compute/api.py index c06fefdd3cf..eebfb2cbe4c 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -4697,6 +4697,7 @@ def rescue(self, context, instance, rescue_password=None, allow_bfv_rescue=False): """Rescue the given instance.""" + image_meta = None if rescue_image_ref: try: image_meta = image_meta_obj.ImageMeta.from_image_ref( @@ -4717,6 +4718,8 @@ def rescue(self, context, instance, rescue_password=None, "image properties set") raise exception.UnsupportedRescueImage( image=rescue_image_ref) + else: + image_meta = instance.image_meta bdms = 
objects.BlockDeviceMappingList.get_by_instance_uuid( context, instance.uuid) @@ -4725,6 +4728,9 @@ def rescue(self, context, instance, rescue_password=None, volume_backed = compute_utils.is_volume_backed_instance( context, instance, bdms) + allow_bfv_rescue &= 'hw_rescue_bus' in image_meta.properties and \ + 'hw_rescue_device' in image_meta.properties + if volume_backed and allow_bfv_rescue: cn = objects.ComputeNode.get_by_host_and_nodename( context, instance.host, instance.node) diff --git a/nova/tests/functional/test_server_rescue.py b/nova/tests/functional/test_server_rescue.py index fa96c10344a..8f5b9129437 100644 --- a/nova/tests/functional/test_server_rescue.py +++ b/nova/tests/functional/test_server_rescue.py @@ -10,6 +10,10 @@ # License for the specific language governing permissions and limitations # under the License. +import datetime + +from oslo_utils.fixture import uuidsentinel as uuids + from nova.tests import fixtures as nova_fixtures from nova.tests.functional.api import client from nova.tests.functional import integrated_helpers @@ -23,7 +27,37 @@ def setUp(self): self.useFixture(nova_fixtures.CinderFixture(self)) self._start_compute(host='host1') - def _create_bfv_server(self): + def _create_image(self, metadata=None): + image = { + 'id': uuids.stable_rescue_image, + 'name': 'fake-image-rescue-property', + 'created_at': datetime.datetime(2011, 1, 1, 1, 2, 3), + 'updated_at': datetime.datetime(2011, 1, 1, 1, 2, 3), + 'deleted_at': None, + 'deleted': False, + 'status': 'active', + 'is_public': False, + 'container_format': 'raw', + 'disk_format': 'raw', + 'size': '25165824', + 'min_ram': 0, + 'min_disk': 0, + 'protected': False, + 'visibility': 'public', + 'tags': ['tag1', 'tag2'], + 'properties': { + 'kernel_id': 'nokernel', + 'ramdisk_id': 'nokernel', + 'hw_rescue_device': 'disk', + 'hw_rescue_bus': 'scsi', + }, + } + if metadata: + image['properties'].update(metadata) + return self.glance.create(None, image) + + def _create_bfv_server(self, 
metadata=None): + image = self._create_image(metadata=metadata) server_request = self._build_server(networks=[]) server_request.pop('imageRef') server_request['block_device_mapping_v2'] = [{ @@ -33,7 +67,7 @@ def _create_bfv_server(self): 'destination_type': 'volume'}] server = self.api.post_server({'server': server_request}) self._wait_for_state_change(server, 'ACTIVE') - return server + return server, image class DisallowBFVRescuev286(BFVRescue): @@ -43,10 +77,10 @@ class DisallowBFVRescuev286(BFVRescue): microversion = '2.86' def test_bfv_rescue_not_supported(self): - server = self._create_bfv_server() + server, image = self._create_bfv_server() ex = self.assertRaises(client.OpenStackApiException, self.api.post_server_action, server['id'], {'rescue': { - 'rescue_image_ref': '155d900f-4e14-4e4c-a73d-069cbf4541e6'}}) + 'rescue_image_ref': image['id']}}) self.assertEqual(400, ex.response.status_code) self.assertIn('Cannot rescue a volume-backed instance', ex.response.text) @@ -60,10 +94,10 @@ class DisallowBFVRescuev286WithTrait(BFVRescue): microversion = '2.86' def test_bfv_rescue_not_supported(self): - server = self._create_bfv_server() + server, image = self._create_bfv_server() ex = self.assertRaises(client.OpenStackApiException, self.api.post_server_action, server['id'], {'rescue': { - 'rescue_image_ref': '155d900f-4e14-4e4c-a73d-069cbf4541e6'}}) + 'rescue_image_ref': image['id']}}) self.assertEqual(400, ex.response.status_code) self.assertIn('Cannot rescue a volume-backed instance', ex.response.text) @@ -77,10 +111,10 @@ class DisallowBFVRescuev287WithoutTrait(BFVRescue): microversion = '2.87' def test_bfv_rescue_not_supported(self): - server = self._create_bfv_server() + server, image = self._create_bfv_server() ex = self.assertRaises(client.OpenStackApiException, self.api.post_server_action, server['id'], {'rescue': { - 'rescue_image_ref': '155d900f-4e14-4e4c-a73d-069cbf4541e6'}}) + 'rescue_image_ref': image['id']}}) self.assertEqual(400, 
ex.response.status_code) self.assertIn('Host unable to rescue a volume-backed instance', ex.response.text) @@ -94,7 +128,41 @@ class AllowBFVRescuev287WithTrait(BFVRescue): microversion = '2.87' def test_bfv_rescue_supported(self): - server = self._create_bfv_server() + server, image = self._create_bfv_server() self.api.post_server_action(server['id'], {'rescue': { + 'rescue_image_ref': image['id']}}) + self._wait_for_state_change(server, 'RESCUE') + + +class DisallowBFVRescuev287WithoutRescueImageProperties(BFVRescue): + """Asserts that BFV rescue requests fail with microversion 2.87 (or later) + when the required hw_rescue_device and hw_rescue_bus image properties + are not set on the image. + """ + compute_driver = 'fake.MediumFakeDriver' + microversion = '2.87' + + def test_bfv_rescue_failed(self): + server, image = self._create_bfv_server() + # try rescue without hw_rescue_device and hw_rescue_bus properties set + ex = self.assertRaises(client.OpenStackApiException, + self.api.post_server_action, server['id'], {'rescue': { 'rescue_image_ref': '155d900f-4e14-4e4c-a73d-069cbf4541e6'}}) + self.assertEqual(400, ex.response.status_code) + self.assertIn('Cannot rescue a volume-backed instance', + ex.response.text) + + +class AllowBFVRescuev287WithRescueImageProperties(BFVRescue): + """Asserts that BFV rescue requests pass with microversion 2.87 (or later) + when the required hw_rescue_device and hw_rescue_bus image properties + are set on the image. 
+ """ + compute_driver = 'fake.RescueBFVDriver' + microversion = '2.87' + + def test_bfv_rescue_done(self): + server, image = self._create_bfv_server() + self.api.post_server_action(server['id'], {'rescue': { + 'rescue_image_ref': image['id']}}) self._wait_for_state_change(server, 'RESCUE') diff --git a/nova/tests/unit/compute/test_api.py b/nova/tests/unit/compute/test_api.py index ca72474a4cb..984b7a033d4 100644 --- a/nova/tests/unit/compute/test_api.py +++ b/nova/tests/unit/compute/test_api.py @@ -5790,7 +5790,10 @@ def test_rescue_bfv_with_required_trait(self, mock_get_bdms, destination_type='volume', volume_type=None, snapshot_id=None, volume_id=uuids.volume_id, volume_size=None)]) - rescue_image_meta_obj = image_meta_obj.ImageMeta.from_dict({}) + rescue_image_meta_obj = image_meta_obj.ImageMeta.from_dict({ + 'properties': {'hw_rescue_device': 'disk', + 'hw_rescue_bus': 'scsi'} + }) with test.nested( mock.patch.object(self.compute_api.placementclient, @@ -5842,6 +5845,7 @@ def test_rescue_bfv_with_required_trait(self, mock_get_bdms, # Assert that the instance task state as set in the compute API self.assertEqual(task_states.RESCUING, instance.task_state) + @mock.patch('nova.objects.instance.Instance.image_meta') @mock.patch('nova.objects.compute_node.ComputeNode' '.get_by_host_and_nodename') @mock.patch('nova.compute.utils.is_volume_backed_instance', @@ -5850,7 +5854,8 @@ def test_rescue_bfv_with_required_trait(self, mock_get_bdms, '.get_by_instance_uuid') def test_rescue_bfv_without_required_trait(self, mock_get_bdms, mock_is_volume_backed, - mock_get_cn): + mock_get_cn, + mock_image_meta): instance = self._create_instance_obj() bdms = objects.BlockDeviceMappingList(objects=[ objects.BlockDeviceMapping( @@ -5858,6 +5863,12 @@ def test_rescue_bfv_without_required_trait(self, mock_get_bdms, destination_type='volume', volume_type=None, snapshot_id=None, volume_id=uuids.volume_id, volume_size=None)]) + + instance.image_meta = image_meta_obj.ImageMeta.from_dict({ + 
'properties': {'hw_rescue_device': 'disk', + 'hw_rescue_bus': 'scsi'} + }) + with test.nested( mock.patch.object(self.compute_api.placementclient, 'get_provider_traits'), @@ -5895,6 +5906,124 @@ def test_rescue_bfv_without_required_trait(self, mock_get_bdms, mock_get_traits.assert_called_once_with( self.context, uuids.cn) + @mock.patch('nova.objects.image_meta.ImageMeta.from_image_ref') + @mock.patch('nova.objects.compute_node.ComputeNode' + '.get_by_host_and_nodename') + @mock.patch('nova.compute.utils.is_volume_backed_instance', + return_value=True) + @mock.patch('nova.objects.block_device.BlockDeviceMappingList' + '.get_by_instance_uuid') + def test_rescue_bfv_with_required_image_properties( + self, mock_get_bdms, mock_is_volume_backed, mock_get_cn, + mock_image_meta_obj_from_ref): + instance = self._create_instance_obj() + bdms = objects.BlockDeviceMappingList(objects=[ + objects.BlockDeviceMapping( + boot_index=0, image_id=uuids.image_id, source_type='image', + destination_type='volume', volume_type=None, + snapshot_id=None, volume_id=uuids.volume_id, + volume_size=None)]) + rescue_image_meta_obj = image_meta_obj.ImageMeta.from_dict({ + 'properties': {'hw_rescue_device': 'disk', + 'hw_rescue_bus': 'scsi'} + }) + + with test.nested( + mock.patch.object(self.compute_api.placementclient, + 'get_provider_traits'), + mock.patch.object(self.compute_api.volume_api, 'get'), + mock.patch.object(self.compute_api.volume_api, 'check_attached'), + mock.patch.object(instance, 'save'), + mock.patch.object(self.compute_api, '_record_action_start'), + mock.patch.object(self.compute_api.compute_rpcapi, + 'rescue_instance') + ) as ( + mock_get_traits, mock_get_volume, mock_check_attached, + mock_instance_save, mock_record_start, mock_rpcapi_rescue + ): + # Mock out the returned compute node, image_meta, bdms and volume + mock_image_meta_obj_from_ref.return_value = rescue_image_meta_obj + mock_get_bdms.return_value = bdms + mock_get_volume.return_value = mock.sentinel.volume + 
mock_get_cn.return_value = mock.Mock(uuid=uuids.cn) + + # Ensure the required trait is returned, allowing BFV rescue + mock_trait_info = mock.Mock(traits=[ot.COMPUTE_RESCUE_BFV]) + mock_get_traits.return_value = mock_trait_info + + # Try to rescue the instance + self.compute_api.rescue(self.context, instance, + rescue_image_ref=uuids.rescue_image_id, + allow_bfv_rescue=True) + + # Assert all of the calls made in the compute API + mock_get_bdms.assert_called_once_with(self.context, instance.uuid) + mock_get_volume.assert_called_once_with( + self.context, uuids.volume_id) + mock_check_attached.assert_called_once_with( + self.context, mock.sentinel.volume) + mock_is_volume_backed.assert_called_once_with( + self.context, instance, bdms) + mock_get_cn.assert_called_once_with( + self.context, instance.host, instance.node) + mock_get_traits.assert_called_once_with(self.context, uuids.cn) + mock_instance_save.assert_called_once_with( + expected_task_state=[None]) + mock_record_start.assert_called_once_with( + self.context, instance, instance_actions.RESCUE) + mock_rpcapi_rescue.assert_called_once_with( + self.context, instance=instance, rescue_password=None, + rescue_image_ref=uuids.rescue_image_id, clean_shutdown=True) + + # Assert that the instance task state as set in the compute API + self.assertEqual(task_states.RESCUING, instance.task_state) + + @mock.patch('nova.objects.image_meta.ImageMeta.from_image_ref') + @mock.patch('nova.compute.utils.is_volume_backed_instance', + return_value=True) + @mock.patch('nova.objects.block_device.BlockDeviceMappingList' + '.get_by_instance_uuid') + def test_rescue_bfv_without_required_image_properties( + self, mock_get_bdms, mock_is_volume_backed, + mock_image_meta_obj_from_ref): + instance = self._create_instance_obj() + bdms = objects.BlockDeviceMappingList(objects=[ + objects.BlockDeviceMapping( + boot_index=0, image_id=uuids.image_id, source_type='image', + destination_type='volume', volume_type=None, + snapshot_id=None, 
volume_id=uuids.volume_id, + volume_size=None)]) + rescue_image_meta_obj = image_meta_obj.ImageMeta.from_dict({ + 'properties': {} + }) + + with test.nested( + mock.patch.object(self.compute_api.volume_api, 'get'), + mock.patch.object(self.compute_api.volume_api, 'check_attached'), + ) as ( + mock_get_volume, mock_check_attached + ): + # Mock out the returned bdms, volume and image_meta + mock_get_bdms.return_value = bdms + mock_get_volume.return_value = mock.sentinel.volume + mock_image_meta_obj_from_ref.return_value = rescue_image_meta_obj + + # Assert that any attempt to rescue a bfv instance on a compute + # node that does not report the COMPUTE_RESCUE_BFV trait fails and + # raises InstanceNotRescuable + self.assertRaises(exception.InstanceNotRescuable, + self.compute_api.rescue, self.context, instance, + rescue_image_ref=None, allow_bfv_rescue=True) + + # Assert the calls made in the compute API prior to the failure + mock_get_bdms.assert_called_once_with(self.context, instance.uuid) + mock_get_volume.assert_called_once_with( + self.context, uuids.volume_id) + mock_check_attached.assert_called_once_with( + self.context, mock.sentinel.volume) + mock_is_volume_backed.assert_called_once_with( + self.context, instance, bdms) + @mock.patch('nova.compute.utils.is_volume_backed_instance', return_value=True) @mock.patch('nova.objects.block_device.BlockDeviceMappingList' diff --git a/releasenotes/notes/rescue-volume-based-instance-c6e3fba236d90be7.yaml b/releasenotes/notes/rescue-volume-based-instance-c6e3fba236d90be7.yaml new file mode 100644 index 00000000000..7e80059b801 --- /dev/null +++ b/releasenotes/notes/rescue-volume-based-instance-c6e3fba236d90be7.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fix rescuing volume based instance by adding a check for 'hw_rescue_disk' + and 'hw_rescue_device' properties in image metadata before attempting + to rescue instance. 
From ec15df83d2d8eb9744438c7129f64a00e5c5694e Mon Sep 17 00:00:00 2001 From: Slawek Kaplonski Date: Fri, 2 Sep 2022 15:58:50 +0200 Subject: [PATCH 20/73] Don't provide MTU value in metadata service if DHCP is enabled For networks with subnets with enabled DHCP service don't provide mtu value in the metadata. That way cloud-init will not configure it "statically" in e.g. netplan's config file and guest OS will use MTU value provided by the DHCP service. Closes-Bug: #1899487 Change-Id: Ib775c2210349b72b3dc033554ac6d8b35b8d2d79 (cherry picked from commit 6bdc79af30151f683c0f462bc6c69da30ebcbcf9) --- nova/tests/unit/virt/test_netutils.py | 23 +++++++++++++++++++ nova/virt/netutils.py | 9 +++++++- ...ks-with-enabled-dhcp-641506f2a13b540f.yaml | 5 ++++ 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/Do-not-send-mtu-value-in-metadata-for-networks-with-enabled-dhcp-641506f2a13b540f.yaml diff --git a/nova/tests/unit/virt/test_netutils.py b/nova/tests/unit/virt/test_netutils.py index de3f4513518..fa0e16df19c 100644 --- a/nova/tests/unit/virt/test_netutils.py +++ b/nova/tests/unit/virt/test_netutils.py @@ -17,6 +17,17 @@ class TestNetUtilsTestCase(test.NoDBTestCase): + + def _get_fake_instance_nw_info(self, num_networks, dhcp_server, mtu): + network_info = fake_network.fake_get_instance_nw_info(self, + num_networks) + for vif in network_info: + for subnet in vif['network']['subnets']: + subnet['meta']['dhcp_server'] = dhcp_server + vif['network']['meta']['mtu'] = mtu + + return network_info + def test_get_cached_vifs_with_vlan_no_nw_info(self): # Make sure that an empty dictionary will be returned when # nw_info is None @@ -39,3 +50,15 @@ def test_get_cached_vifs_with_vlan(self): expected = {'fa:16:3e:d1:28:e4': '2145'} self.assertEqual(expected, netutils.get_cached_vifs_with_vlan(network_info)) + + def test__get_link_mtu(self): + network_info_dhcp = self._get_fake_instance_nw_info( + 1, '192.168.0.100', 9000) + network_info_no_dhcp = 
self._get_fake_instance_nw_info( + 1, None, 9000) + + for vif in network_info_dhcp: + self.assertIsNone(netutils._get_link_mtu(vif)) + + for vif in network_info_no_dhcp: + self.assertEqual(9000, netutils._get_link_mtu(vif)) diff --git a/nova/virt/netutils.py b/nova/virt/netutils.py index 6ea91e2221e..0ab3ddc4c11 100644 --- a/nova/virt/netutils.py +++ b/nova/virt/netutils.py @@ -263,12 +263,19 @@ def _get_eth_link(vif, ifc_num): 'id': link_id, 'vif_id': vif['id'], 'type': nic_type, - 'mtu': vif['network']['meta'].get('mtu'), + 'mtu': _get_link_mtu(vif), 'ethernet_mac_address': vif.get('address'), } return link +def _get_link_mtu(vif): + for subnet in vif['network']['subnets']: + if subnet['meta'].get('dhcp_server'): + return None + return vif['network']['meta'].get('mtu') + + def _get_nets(vif, subnet, version, net_num, link_id): """Get networks for the given VIF and subnet diff --git a/releasenotes/notes/Do-not-send-mtu-value-in-metadata-for-networks-with-enabled-dhcp-641506f2a13b540f.yaml b/releasenotes/notes/Do-not-send-mtu-value-in-metadata-for-networks-with-enabled-dhcp-641506f2a13b540f.yaml new file mode 100644 index 00000000000..b5232f5ea2f --- /dev/null +++ b/releasenotes/notes/Do-not-send-mtu-value-in-metadata-for-networks-with-enabled-dhcp-641506f2a13b540f.yaml @@ -0,0 +1,5 @@ +--- +other: + - | + For networks which have any subnets with enabled DHCP, MTU value is not send + in the metadata. In such case MTU is configured through the DHCP server. From d9dfd0e0445ae82933eb75a38127186659041673 Mon Sep 17 00:00:00 2001 From: Artom Lifshitz Date: Fri, 28 Oct 2022 19:42:21 -0400 Subject: [PATCH 21/73] Reproduce bug 1995153 If we first boot an instance with NUMA topology on a host, any subsequent attempts to boot instances with the `socket` PCI NUMA policy will fail with `Cannot load 'socket' in the base class`. Demonstrate this in a functional test. 
Change-Id: I63f4e3dfa38f65b73d0051b8e52b1abd0f027e9b Related-bug: 1995153 (cherry picked from commit 63d6ecd99b7dec06cf0cf8358b43b0d8fa607504) (cherry picked from commit 29e3f2f2ab69157d938cfe6895df352ef9a08d8c) --- .../regressions/test_bug_1995153.py | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 nova/tests/functional/regressions/test_bug_1995153.py diff --git a/nova/tests/functional/regressions/test_bug_1995153.py b/nova/tests/functional/regressions/test_bug_1995153.py new file mode 100644 index 00000000000..c897156d991 --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_1995153.py @@ -0,0 +1,109 @@ +# Copyright (C) 2023 Red Hat, Inc +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ + +import fixtures +from unittest import mock + +from oslo_serialization import jsonutils +from oslo_utils import units + +from nova.objects import fields +from nova.tests.fixtures import libvirt as fakelibvirt +from nova.tests.functional import integrated_helpers +from nova.tests.functional.libvirt import base + + +class Bug1995153RegressionTest( + base.ServersTestBase, + integrated_helpers.InstanceHelperMixin +): + + ADDITIONAL_FILTERS = ['NUMATopologyFilter', 'PciPassthroughFilter'] + + ALIAS_NAME = 'a1' + PCI_DEVICE_SPEC = [jsonutils.dumps( + { + 'vendor_id': fakelibvirt.PCI_VEND_ID, + 'product_id': fakelibvirt.PCI_PROD_ID, + } + )] + # we set the numa_affinity policy to required to ensure strict affinity + # between pci devices and the guest cpu and memory will be enforced. + PCI_ALIAS = [jsonutils.dumps( + { + 'vendor_id': fakelibvirt.PCI_VEND_ID, + 'product_id': fakelibvirt.PCI_PROD_ID, + 'name': ALIAS_NAME, + 'device_type': fields.PciDeviceType.STANDARD, + 'numa_policy': fields.PCINUMAAffinityPolicy.REQUIRED, + } + )] + + def setUp(self): + super(Bug1995153RegressionTest, self).setUp() + self.flags( + device_spec=self.PCI_DEVICE_SPEC, + alias=self.PCI_ALIAS, + group='pci' + ) + host_manager = self.scheduler.manager.host_manager + pci_filter_class = host_manager.filter_cls_map['PciPassthroughFilter'] + host_pass_mock = mock.Mock(wraps=pci_filter_class().host_passes) + self.mock_filter = self.useFixture(fixtures.MockPatch( + 'nova.scheduler.filters.pci_passthrough_filter' + '.PciPassthroughFilter.host_passes', + side_effect=host_pass_mock)).mock + + def test_socket_policy_bug_1995153(self): + """The numa_usage_from_instance_numa() method in hardware.py saves the + host NUMAToplogy object with NUMACells that have no `socket` set. This + was an omission in the original implementation of the `socket` PCI NUMA + affinity policy. 
The consequence is that any code path that calls into + numa_usage_from_instance_numa() will clobber the host NUMA topology in + the database with a socket-less version. Booting an instance with NUMA + toplogy will do that, for example. If then a second instance is booted + with the `socket` PCI NUMA affinity policy, it will read the + socket-less host NUMATopology from the database, and error out with a + NotImplementedError. This is bug 1995153. + """ + host_info = fakelibvirt.HostInfo( + cpu_nodes=2, cpu_sockets=1, cpu_cores=2, cpu_threads=2, + kB_mem=(16 * units.Gi) // units.Ki) + self.flags(cpu_dedicated_set='0-3', group='compute') + pci_info = fakelibvirt.HostPCIDevicesInfo(num_pci=1, numa_node=1) + + self.start_compute(host_info=host_info, pci_info=pci_info) + + extra_spec = { + 'hw:cpu_policy': 'dedicated', + 'pci_passthrough:alias': '%s:1' % self.ALIAS_NAME, + 'hw:pci_numa_affinity_policy': 'socket' + } + # Boot a first instance with a guest NUMA topology to run the buggy + # code in numa_usage_from_instance_numa() and save the socket-less host + # NUMATopology to the database. + self._create_server( + flavor_id=self._create_flavor( + extra_spec={'hw:cpu_policy': 'dedicated'})) + + # FIXME(artom) Attempt to boot an instance with the `socket` PCI NUMA + # affinity policy and observe the fireworks. + flavor_id = self._create_flavor(extra_spec=extra_spec) + server = self._create_server(flavor_id=flavor_id, + expected_state='ERROR') + self.assertIn('fault', server) + self.assertIn('NotImplementedError', server['fault']['message']) + self.assertTrue(self.mock_filter.called) From 693318573cb7bbbcf074c3cc01339f07fe04ff24 Mon Sep 17 00:00:00 2001 From: Artom Lifshitz Date: Fri, 28 Oct 2022 18:09:35 -0400 Subject: [PATCH 22/73] Save cell socket correctly when updating host NUMA topology Previously, in numa_usage_from_instance_numa(), any new NUMACell objects we created did not have the `socket` attribute. 
In some cases this was persisted all the way down to the database. Fix this by copying `socket` from the old_cell. Change-Id: I9ed3c31ccd3220b02d951fc6dbc5ea049a240a68 Closes-Bug: 1995153 (cherry picked from commit 04ebae9dc01ebd24552b5aacd1a0f8b129013a9e) (cherry picked from commit acb511652c1afb8253c66c29ca10f790f035229e) --- .../regressions/test_bug_1995153.py | 36 +++++++++---------- nova/tests/unit/compute/test_compute.py | 2 ++ .../unit/compute/test_resource_tracker.py | 2 ++ nova/tests/unit/scheduler/fakes.py | 2 ++ nova/tests/unit/virt/test_hardware.py | 29 +++++++++++++++ nova/virt/hardware.py | 1 + 6 files changed, 53 insertions(+), 19 deletions(-) diff --git a/nova/tests/functional/regressions/test_bug_1995153.py b/nova/tests/functional/regressions/test_bug_1995153.py index c897156d991..f4e61d06dfc 100644 --- a/nova/tests/functional/regressions/test_bug_1995153.py +++ b/nova/tests/functional/regressions/test_bug_1995153.py @@ -68,16 +68,17 @@ def setUp(self): side_effect=host_pass_mock)).mock def test_socket_policy_bug_1995153(self): - """The numa_usage_from_instance_numa() method in hardware.py saves the - host NUMAToplogy object with NUMACells that have no `socket` set. This - was an omission in the original implementation of the `socket` PCI NUMA - affinity policy. The consequence is that any code path that calls into - numa_usage_from_instance_numa() will clobber the host NUMA topology in - the database with a socket-less version. Booting an instance with NUMA - toplogy will do that, for example. If then a second instance is booted - with the `socket` PCI NUMA affinity policy, it will read the - socket-less host NUMATopology from the database, and error out with a - NotImplementedError. This is bug 1995153. + """Previously, the numa_usage_from_instance_numa() method in + hardware.py saved the host NUMAToplogy object with NUMACells that have + no `socket` set. 
This was an omission in the original implementation of + the `socket` PCI NUMA affinity policy. The consequence was that any + code path that called into numa_usage_from_instance_numa() would + clobber the host NUMA topology in the database with a socket-less + version. Booting an instance with NUMA toplogy would do that, for + example. If then a second instance was booted with the `socket` PCI + NUMA affinity policy, it would read the socket-less host NUMATopology + from the database, and error out with a NotImplementedError. This was + bug 1995153. Demonstrate that this is fixed. """ host_info = fakelibvirt.HostInfo( cpu_nodes=2, cpu_sockets=1, cpu_cores=2, cpu_threads=2, @@ -92,18 +93,15 @@ def test_socket_policy_bug_1995153(self): 'pci_passthrough:alias': '%s:1' % self.ALIAS_NAME, 'hw:pci_numa_affinity_policy': 'socket' } - # Boot a first instance with a guest NUMA topology to run the buggy - # code in numa_usage_from_instance_numa() and save the socket-less host - # NUMATopology to the database. + # Boot a first instance with a guest NUMA topology to run the + # numa_usage_from_instance_numa() and update the host NUMATopology in + # the database. self._create_server( flavor_id=self._create_flavor( extra_spec={'hw:cpu_policy': 'dedicated'})) - # FIXME(artom) Attempt to boot an instance with the `socket` PCI NUMA - # affinity policy and observe the fireworks. + # Boot an instance with the `socket` PCI NUMA affinity policy and + # assert that it boots correctly now. 
flavor_id = self._create_flavor(extra_spec=extra_spec) - server = self._create_server(flavor_id=flavor_id, - expected_state='ERROR') - self.assertIn('fault', server) - self.assertIn('NotImplementedError', server['fault']['message']) + self._create_server(flavor_id=flavor_id) self.assertTrue(self.mock_filter.called) diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 314c29f583d..c0e68f8bb18 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -5670,6 +5670,7 @@ def test_confirm_resize_with_numa_topology_and_cpu_pinning( pagesize=2048, cpu_usage=2, memory_usage=0, + socket=0, pinned_cpus=set([1, 2]), siblings=[set([1]), set([2])], mempages=[objects.NUMAPagesTopology( @@ -5685,6 +5686,7 @@ def test_confirm_resize_with_numa_topology_and_cpu_pinning( pagesize=2048, memory_usage=0, cpu_usage=0, + socket=0, siblings=[set([3]), set([4])], mempages=[objects.NUMAPagesTopology( size_kb=2048, total=256, used=0)]) diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index b81d7365d25..9804a2691f0 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -179,6 +179,7 @@ memory=_2MB, cpu_usage=0, memory_usage=0, + socket=0, mempages=[_NUMA_PAGE_TOPOLOGIES['2mb*1024']], siblings=[set([1]), set([2])], pinned_cpus=set()), @@ -189,6 +190,7 @@ memory=_2MB, cpu_usage=0, memory_usage=0, + socket=0, mempages=[_NUMA_PAGE_TOPOLOGIES['2mb*1024']], siblings=[set([3]), set([4])], pinned_cpus=set())]), diff --git a/nova/tests/unit/scheduler/fakes.py b/nova/tests/unit/scheduler/fakes.py index 658c82c20eb..f5dcf87e4ab 100644 --- a/nova/tests/unit/scheduler/fakes.py +++ b/nova/tests/unit/scheduler/fakes.py @@ -34,6 +34,7 @@ memory=512, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), mempages=[ objects.NUMAPagesTopology(size_kb=16, total=387184, used=0), @@ -46,6 
+47,7 @@ memory=512, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), mempages=[ objects.NUMAPagesTopology(size_kb=4, total=1548736, used=0), diff --git a/nova/tests/unit/virt/test_hardware.py b/nova/tests/unit/virt/test_hardware.py index 26ec198f08c..e9a5666e8c1 100644 --- a/nova/tests/unit/virt/test_hardware.py +++ b/nova/tests/unit/virt/test_hardware.py @@ -2023,6 +2023,7 @@ def test_host_usage_contiguous(self): memory=256, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), mempages=[ objects.NUMAPagesTopology(size_kb=4, total=32768, used=0), @@ -2036,6 +2037,7 @@ def test_host_usage_contiguous(self): memory=256, cpu_usage=0, memory_usage=0, + socket=1, pinned_cpus=set(), mempages=[ objects.NUMAPagesTopology(size_kb=4, total=32768, used=64), @@ -2049,6 +2051,7 @@ def test_host_usage_contiguous(self): memory=2, cpu_usage=0, memory_usage=0, + socket=2, pinned_cpus=set(), mempages=[ objects.NUMAPagesTopology(size_kb=4, total=512, used=16)], @@ -2130,6 +2133,7 @@ def test_host_usage_contiguous_pages_compute(self): memory=160, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), mempages=[ objects.NUMAPagesTopology(size_kb=4, total=32768, used=32), @@ -2170,6 +2174,7 @@ def test_host_usage_sparse(self): memory=1024, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), mempages=[ objects.NUMAPagesTopology(size_kb=4, total=512, used=0)], @@ -2181,6 +2186,7 @@ def test_host_usage_sparse(self): memory=512, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), mempages=[ objects.NUMAPagesTopology(size_kb=4, total=512, used=0)], @@ -2192,6 +2198,7 @@ def test_host_usage_sparse(self): memory=512, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), mempages=[ objects.NUMAPagesTopology(size_kb=4, total=512, used=0)], @@ -2258,6 +2265,7 @@ def test_host_usage_cumulative_with_free(self): memory=1024, cpu_usage=2, memory_usage=512, + socket=0, mempages=[ objects.NUMAPagesTopology(size_kb=4, total=512, used=0)], siblings=[set([0]), 
set([1]), set([2]), set([3])], @@ -2269,6 +2277,7 @@ def test_host_usage_cumulative_with_free(self): memory=512, cpu_usage=1, memory_usage=512, + socket=0, pinned_cpus=set(), mempages=[ objects.NUMAPagesTopology(size_kb=4, total=512, used=0)], @@ -2280,6 +2289,7 @@ def test_host_usage_cumulative_with_free(self): memory=256, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), mempages=[ objects.NUMAPagesTopology(size_kb=4, total=512, used=0)], @@ -2330,6 +2340,7 @@ def _topo_usage_reserved_page_size(self): memory=512, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), mempages=[objects.NUMAPagesTopology( size_kb=2048, total=512, used=128, @@ -2342,6 +2353,7 @@ def _topo_usage_reserved_page_size(self): memory=512, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), mempages=[objects.NUMAPagesTopology( size_kb=1048576, total=5, used=2, @@ -2606,6 +2618,7 @@ def setUp(self): memory=2048, cpu_usage=2, memory_usage=2048, + socket=0, pinned_cpus=set(), mempages=[objects.NUMAPagesTopology( size_kb=4, total=524288, used=0)], @@ -2616,6 +2629,7 @@ def setUp(self): memory=2048, cpu_usage=2, memory_usage=2048, + socket=0, pinned_cpus=set(), mempages=[objects.NUMAPagesTopology( size_kb=4, total=524288, used=0)], @@ -4160,6 +4174,7 @@ def test_cpu_pinning_usage_from_instances(self): memory=4096, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), siblings=[set([0]), set([1]), set([2]), set([3])], mempages=[objects.NUMAPagesTopology( @@ -4189,6 +4204,7 @@ def test_cpu_pinning_usage_from_instances_free(self): memory=4096, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set([0, 1, 3]), mempages=[objects.NUMAPagesTopology( size_kb=4, total=524288, used=0)], @@ -4218,6 +4234,7 @@ def test_host_usage_from_instances_fail(self): memory=4096, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), siblings=[set([0]), set([1]), set([2]), set([3])], mempages=[objects.NUMAPagesTopology( @@ -4246,6 +4263,7 @@ def 
test_host_usage_from_instances_isolate(self): memory=4096, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), siblings=[set([0, 2]), set([1, 3])], mempages=[objects.NUMAPagesTopology( @@ -4272,6 +4290,7 @@ def test_host_usage_from_instances_isolate_free(self): memory=4096, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set([0, 1, 2, 3]), siblings=[set([0, 2]), set([1, 3])], mempages=[objects.NUMAPagesTopology( @@ -4298,6 +4317,7 @@ def test_host_usage_from_instances_isolated_without_siblings(self): memory=4096, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), siblings=[set([0]), set([1]), set([2]), set([3])], mempages=[objects.NUMAPagesTopology( @@ -4324,6 +4344,7 @@ def test_host_usage_from_instances_isolated_without_siblings_free(self): memory=4096, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set([0, 1, 2, 3]), siblings=[set([0]), set([1]), set([2]), set([3])], mempages=[objects.NUMAPagesTopology( @@ -4353,6 +4374,7 @@ def test_host_usage_from_mixed_instance(self): memory=4096, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set([2]), siblings=[set([0, 4]), set([1, 5]), set([2, 6]), set([3, 7])], mempages=[objects.NUMAPagesTopology( @@ -4383,6 +4405,7 @@ def test_host_usage_from_mixed_instance_free(self): memory=4096, cpu_usage=2, memory_usage=0, + socket=0, pinned_cpus=set([2, 6, 7]), siblings=[set([0, 4]), set([1, 5]), set([2, 6]), set([3, 7])], mempages=[objects.NUMAPagesTopology( @@ -4415,6 +4438,7 @@ def test_host_usage_from_mixed_instance_emu_isolate(self): cpu_usage=2, memory_usage=0, pinned_cpus=set(), + socket=0, siblings=[{cpu} for cpu in range(8)], mempages=[objects.NUMAPagesTopology( size_kb=4, total=524288, used=0)] @@ -4448,6 +4472,7 @@ def test_host_usage_from_mixed_instance_emu_isolate_free(self): memory=4096, cpu_usage=2, memory_usage=0, + socket=0, pinned_cpus=set([0, 1, 2, 3]), siblings=[{cpu} for cpu in range(8)], mempages=[objects.NUMAPagesTopology( @@ -4490,6 +4515,7 @@ def 
test_host_usage_from_mixed_instance_cpu_isolate(self): memory=4096, cpu_usage=2, memory_usage=0, + socket=0, pinned_cpus=set(), siblings=[set([0, 5]), set([1, 6]), set([2, 7]), set([3, 8]), set([4, 9])], @@ -4529,6 +4555,7 @@ def test_host_usage_from_mixed_instance_cpu_isolate_free(self): memory=4096, cpu_usage=2, memory_usage=0, + socket=0, pinned_cpus=set([0, 1, 2, 5, 6, 7]), siblings=[set([0, 5]), set([1, 6]), set([2, 7]), set([3, 8]), set([4, 9])], @@ -4764,6 +4791,7 @@ def _host_topology(): memory=2048, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), siblings=[set([0]), set([1])], mempages=[objects.NUMAPagesTopology( @@ -4775,6 +4803,7 @@ def _host_topology(): memory=2048, cpu_usage=0, memory_usage=0, + socket=0, pinned_cpus=set(), siblings=[set([2]), set([3])], mempages=[objects.NUMAPagesTopology( diff --git a/nova/virt/hardware.py b/nova/virt/hardware.py index 271a719aa29..3688b57d35f 100644 --- a/nova/virt/hardware.py +++ b/nova/virt/hardware.py @@ -2566,6 +2566,7 @@ def numa_usage_from_instance_numa(host_topology, instance_topology, cpuset=host_cell.cpuset, pcpuset=host_cell.pcpuset, memory=host_cell.memory, + socket=host_cell.socket, cpu_usage=0, memory_usage=0, mempages=host_cell.mempages, From 8b4b99149a35663fc11d7d163082747b1b210b4d Mon Sep 17 00:00:00 2001 From: melanie witt Date: Wed, 15 Feb 2023 22:37:40 +0000 Subject: [PATCH 23/73] Use force=True for os-brick disconnect during delete The 'force' parameter of os-brick's disconnect_volume() method allows callers to ignore flushing errors and ensure that devices are being removed from the host. We should use force=True when we are going to delete an instance to avoid leaving leftover devices connected to the compute host which could then potentially be reused to map to volumes to an instance that should not have access to those volumes. 
We can use force=True even when disconnecting a volume that will not be deleted on termination because os-brick will always attempt to flush and disconnect gracefully before forcefully removing devices. Closes-Bug: #2004555 Change-Id: I3629b84d3255a8fe9d8a7cea8c6131d7c40899e8 (cherry picked from commit db455548a12beac1153ce04eca5e728d7b773901) (cherry picked from commit efb01985db88d6333897018174649b425feaa1b4) --- .../admin/configuration/cross-cell-resize.rst | 2 +- doc/source/admin/configuration/index.rst | 1 + .../configuration/service-user-token.rst | 59 +++++++++++++++++ doc/source/admin/live-migration-usage.rst | 2 +- .../admin/migrate-instance-with-snapshot.rst | 2 +- doc/source/admin/support-compute.rst | 64 ------------------- doc/source/install/compute-install-obs.rst | 20 ++++++ doc/source/install/compute-install-rdo.rst | 20 ++++++ doc/source/install/compute-install-ubuntu.rst | 20 ++++++ doc/source/install/controller-install-obs.rst | 20 ++++++ doc/source/install/controller-install-rdo.rst | 20 ++++++ .../install/controller-install-ubuntu.rst | 20 ++++++ nova/cmd/status.py | 11 ++++ nova/tests/unit/cmd/test_status.py | 16 +++++ nova/tests/unit/virt/hyperv/test_vmops.py | 2 +- nova/tests/unit/virt/hyperv/test_volumeops.py | 26 ++++++-- nova/tests/unit/virt/libvirt/test_driver.py | 61 ++++++++++++++++-- .../virt/libvirt/volume/test_fibrechannel.py | 20 ++++++ .../unit/virt/libvirt/volume/test_iscsi.py | 9 +++ .../unit/virt/libvirt/volume/test_lightos.py | 8 ++- .../unit/virt/libvirt/volume/test_nvme.py | 8 ++- .../unit/virt/libvirt/volume/test_scaleio.py | 8 ++- .../unit/virt/libvirt/volume/test_storpool.py | 16 ++++- .../virt/libvirt/volume/test_vzstorage.py | 8 ++- nova/virt/hyperv/vmops.py | 2 +- nova/virt/hyperv/volumeops.py | 12 ++-- nova/virt/libvirt/driver.py | 7 +- nova/virt/libvirt/volume/fibrechannel.py | 7 +- nova/virt/libvirt/volume/fs.py | 2 +- nova/virt/libvirt/volume/iscsi.py | 7 +- nova/virt/libvirt/volume/lightos.py | 7 +- 
nova/virt/libvirt/volume/nvme.py | 6 +- nova/virt/libvirt/volume/quobyte.py | 2 +- nova/virt/libvirt/volume/scaleio.py | 7 +- nova/virt/libvirt/volume/smbfs.py | 2 +- nova/virt/libvirt/volume/storpool.py | 5 +- nova/virt/libvirt/volume/volume.py | 2 +- nova/virt/libvirt/volume/vzstorage.py | 5 +- .../service-user-token-421d067c16257782.yaml | 11 ++++ 39 files changed, 413 insertions(+), 114 deletions(-) create mode 100644 doc/source/admin/configuration/service-user-token.rst create mode 100644 releasenotes/notes/service-user-token-421d067c16257782.yaml diff --git a/doc/source/admin/configuration/cross-cell-resize.rst b/doc/source/admin/configuration/cross-cell-resize.rst index e51e4257748..0c34fd13f51 100644 --- a/doc/source/admin/configuration/cross-cell-resize.rst +++ b/doc/source/admin/configuration/cross-cell-resize.rst @@ -284,7 +284,7 @@ Troubleshooting Timeouts ~~~~~~~~ -Configure a :ref:`service user ` in case the user token +Configure a :ref:`service user ` in case the user token times out, e.g. during the snapshot and download of a large server image. If RPC calls are timing out with a ``MessagingTimeout`` error in the logs, diff --git a/doc/source/admin/configuration/index.rst b/doc/source/admin/configuration/index.rst index 233597b1fe4..f5b6fde9dac 100644 --- a/doc/source/admin/configuration/index.rst +++ b/doc/source/admin/configuration/index.rst @@ -19,6 +19,7 @@ A list of config options based on different topics can be found below: .. toctree:: :maxdepth: 1 + /admin/configuration/service-user-token /admin/configuration/api /admin/configuration/resize /admin/configuration/cross-cell-resize diff --git a/doc/source/admin/configuration/service-user-token.rst b/doc/source/admin/configuration/service-user-token.rst new file mode 100644 index 00000000000..740730af1d0 --- /dev/null +++ b/doc/source/admin/configuration/service-user-token.rst @@ -0,0 +1,59 @@ +.. _service_user_token: + +=================== +Service User Tokens +=================== + +.. 
note:: + + Configuration of service user tokens is **required** for every Nova service + for security reasons. See https://bugs.launchpad.net/nova/+bug/2004555 for + details. + +Configure Nova to send service user tokens alongside regular user tokens when +making REST API calls to other services. The identity service (Keystone) will +authenticate a request using the service user token if the regular user token +has expired. + +This is important when long-running operations such as live migration or +snapshot take long enough to exceed the expiry of the user token. Without the +service token, if a long-running operation exceeds the expiry of the user +token, post operations such as cleanup after a live migration could fail when +Nova calls other service APIs like block-storage (Cinder) or networking +(Neutron). + +The service token is also used by services to validate whether the API caller +is a service. Some service APIs are restricted to service users only. + +To set up service tokens, create a ``nova`` service user and ``service`` role +in the identity service (Keystone) and assign the ``service`` role to the +``nova`` service user. + +Then, configure the :oslo.config:group:`service_user` section of the Nova +configuration file, for example: + +.. code-block:: ini + + [service_user] + send_service_user_token = true + auth_url = https://104.130.216.102/identity + auth_strategy = keystone + auth_type = password + project_domain_name = Default + project_name = service + user_domain_name = Default + username = nova + password = secretservice + ... + +And configure the other identity options as necessary for the service user, +much like you would configure nova to work with the image service (Glance) or +networking service (Neutron). + +.. 
note:: + + Please note that the role assigned to the :oslo.config:group:`service_user` + needs to be in the configured + :oslo.config:option:`keystone_authtoken.service_token_roles` of other + services such as block-storage (Cinder), image (Glance), and networking + (Neutron). diff --git a/doc/source/admin/live-migration-usage.rst b/doc/source/admin/live-migration-usage.rst index 783ab5e27c2..a1e7f187566 100644 --- a/doc/source/admin/live-migration-usage.rst +++ b/doc/source/admin/live-migration-usage.rst @@ -320,4 +320,4 @@ To make live-migration succeed, you have several options: If live migrations routinely timeout or fail during cleanup operations due to the user token timing out, consider configuring nova to use -:ref:`service user tokens `. +:ref:`service user tokens `. diff --git a/doc/source/admin/migrate-instance-with-snapshot.rst b/doc/source/admin/migrate-instance-with-snapshot.rst index 65059679abb..230431091e0 100644 --- a/doc/source/admin/migrate-instance-with-snapshot.rst +++ b/doc/source/admin/migrate-instance-with-snapshot.rst @@ -67,7 +67,7 @@ Create a snapshot of the instance If snapshot operations routinely fail because the user token times out while uploading a large disk image, consider configuring nova to use - :ref:`service user tokens `. + :ref:`service user tokens `. #. Use the :command:`openstack image list` command to check the status until the status is ``ACTIVE``: diff --git a/doc/source/admin/support-compute.rst b/doc/source/admin/support-compute.rst index 8522e51d795..31e32fd1ddc 100644 --- a/doc/source/admin/support-compute.rst +++ b/doc/source/admin/support-compute.rst @@ -478,67 +478,3 @@ Ensure the ``compute`` endpoint in the identity service catalog is pointing at ``/v2.1`` instead of ``/v2``. The former route supports microversions, while the latter route is considered the legacy v2.0 compatibility-mode route which renders all requests as if they were made on the legacy v2.0 API. - - -.. 
_user_token_timeout: - -User token times out during long-running operations ---------------------------------------------------- - -Problem -~~~~~~~ - -Long-running operations such as live migration or snapshot can sometimes -overrun the expiry of the user token. In such cases, post operations such -as cleaning up after a live migration can fail when the nova-compute service -needs to cleanup resources in other services, such as in the block-storage -(cinder) or networking (neutron) services. - -For example: - -.. code-block:: console - - 2018-12-17 13:47:29.591 16987 WARNING nova.virt.libvirt.migration [req-7bc758de-b2e4-461b-a971-f79be6cd4703 313d1247d7b845da9c731eec53e50a26 2f693c782fa748c2baece8db95b4ba5b - default default] [instance: ead8ecc3-f473-4672-a67b-c44534c6042d] Live migration not completed after 2400 sec - 2018-12-17 13:47:30.097 16987 WARNING nova.virt.libvirt.driver [req-7bc758de-b2e4-461b-a971-f79be6cd4703 313d1247d7b845da9c731eec53e50a26 2f693c782fa748c2baece8db95b4ba5b - default default] [instance: ead8ecc3-f473-4672-a67b-c44534c6042d] Migration operation was cancelled - 2018-12-17 13:47:30.299 16987 ERROR nova.virt.libvirt.driver [req-7bc758de-b2e4-461b-a971-f79be6cd4703 313d1247d7b845da9c731eec53e50a26 2f693c782fa748c2baece8db95b4ba5b - default default] [instance: ead8ecc3-f473-4672-a67b-c44534c6042d] Live Migration failure: operation aborted: migration job: canceled by client: libvirtError: operation aborted: migration job: canceled by client - 2018-12-17 13:47:30.685 16987 INFO nova.compute.manager [req-7bc758de-b2e4-461b-a971-f79be6cd4703 313d1247d7b845da9c731eec53e50a26 2f693c782fa748c2baece8db95b4ba5b - default default] [instance: ead8ecc3-f473-4672-a67b-c44534c6042d] Swapping old allocation on 3e32d595-bd1f-4136-a7f4-c6703d2fbe18 held by migration 17bec61d-544d-47e0-a1c1-37f9d7385286 for instance - 2018-12-17 13:47:32.450 16987 ERROR nova.volume.cinder [req-7bc758de-b2e4-461b-a971-f79be6cd4703 313d1247d7b845da9c731eec53e50a26 
2f693c782fa748c2baece8db95b4ba5b - default default] Delete attachment failed for attachment 58997d5b-24f0-4073-819e-97916fb1ee19. Error: The request you have made requires authentication. (HTTP 401) Code: 401: Unauthorized: The request you have made requires authentication. (HTTP 401) - -Solution -~~~~~~~~ - -Configure nova to use service user tokens to supplement the regular user token -used to initiate the operation. The identity service (keystone) will then -authenticate a request using the service user token if the user token has -already expired. - -To use, create a service user in the identity service similar as you would when -creating the ``nova`` service user. - -Then configure the :oslo.config:group:`service_user` section of the nova -configuration file, for example: - -.. code-block:: ini - - [service_user] - send_service_user_token = True - auth_type = password - project_domain_name = Default - project_name = service - user_domain_name = Default - password = secretservice - username = nova - auth_url = https://104.130.216.102/identity - ... - -And configure the other identity options as necessary for the service user, -much like you would configure nova to work with the image service (glance) -or networking service. - -.. note:: - - Please note that the role of the :oslo.config:group:`service_user` you - configure needs to be a superset of - :oslo.config:option:`keystone_authtoken.service_token_roles` (The option - :oslo.config:option:`keystone_authtoken.service_token_roles` is configured - in cinder, glance and neutron). diff --git a/doc/source/install/compute-install-obs.rst b/doc/source/install/compute-install-obs.rst index c5c1d29fb3d..c227b6eba43 100644 --- a/doc/source/install/compute-install-obs.rst +++ b/doc/source/install/compute-install-obs.rst @@ -92,6 +92,26 @@ Install and configure components Comment out or remove any other options in the ``[keystone_authtoken]`` section. 
+ * In the ``[service_user]`` section, configure :ref:`service user + tokens `: + + .. path /etc/nova/nova.conf + .. code-block:: ini + + [service_user] + send_service_user_token = true + auth_url = https://controller/identity + auth_strategy = keystone + auth_type = password + project_domain_name = Default + project_name = service + user_domain_name = Default + username = nova + password = NOVA_PASS + + Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in + the Identity service. + * In the ``[DEFAULT]`` section, configure the ``my_ip`` option: .. path /etc/nova/nova.conf diff --git a/doc/source/install/compute-install-rdo.rst b/doc/source/install/compute-install-rdo.rst index 0a5ad685a62..0c6203a6673 100644 --- a/doc/source/install/compute-install-rdo.rst +++ b/doc/source/install/compute-install-rdo.rst @@ -84,6 +84,26 @@ Install and configure components Comment out or remove any other options in the ``[keystone_authtoken]`` section. + * In the ``[service_user]`` section, configure :ref:`service user + tokens `: + + .. path /etc/nova/nova.conf + .. code-block:: ini + + [service_user] + send_service_user_token = true + auth_url = https://controller/identity + auth_strategy = keystone + auth_type = password + project_domain_name = Default + project_name = service + user_domain_name = Default + username = nova + password = NOVA_PASS + + Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in + the Identity service. + * In the ``[DEFAULT]`` section, configure the ``my_ip`` option: .. path /etc/nova/nova.conf diff --git a/doc/source/install/compute-install-ubuntu.rst b/doc/source/install/compute-install-ubuntu.rst index 8605c73316e..baf0585e52b 100644 --- a/doc/source/install/compute-install-ubuntu.rst +++ b/doc/source/install/compute-install-ubuntu.rst @@ -74,6 +74,26 @@ Install and configure components Comment out or remove any other options in the ``[keystone_authtoken]`` section. 
+ * In the ``[service_user]`` section, configure :ref:`service user + tokens `: + + .. path /etc/nova/nova.conf + .. code-block:: ini + + [service_user] + send_service_user_token = true + auth_url = https://controller/identity + auth_strategy = keystone + auth_type = password + project_domain_name = Default + project_name = service + user_domain_name = Default + username = nova + password = NOVA_PASS + + Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in + the Identity service. + * In the ``[DEFAULT]`` section, configure the ``my_ip`` option: .. path /etc/nova/nova.conf diff --git a/doc/source/install/controller-install-obs.rst b/doc/source/install/controller-install-obs.rst index 18499612c3e..01b7bb0f5ab 100644 --- a/doc/source/install/controller-install-obs.rst +++ b/doc/source/install/controller-install-obs.rst @@ -260,6 +260,26 @@ Install and configure components Comment out or remove any other options in the ``[keystone_authtoken]`` section. + * In the ``[service_user]`` section, configure :ref:`service user + tokens `: + + .. path /etc/nova/nova.conf + .. code-block:: ini + + [service_user] + send_service_user_token = true + auth_url = https://controller/identity + auth_strategy = keystone + auth_type = password + project_domain_name = Default + project_name = service + user_domain_name = Default + username = nova + password = NOVA_PASS + + Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in + the Identity service. 
+ * In the ``[DEFAULT]`` section, configure the ``my_ip`` option to use the management interface IP address of the controller node: diff --git a/doc/source/install/controller-install-rdo.rst b/doc/source/install/controller-install-rdo.rst index fd2419631ec..b6098f1776b 100644 --- a/doc/source/install/controller-install-rdo.rst +++ b/doc/source/install/controller-install-rdo.rst @@ -247,6 +247,26 @@ Install and configure components Comment out or remove any other options in the ``[keystone_authtoken]`` section. + * In the ``[service_user]`` section, configure :ref:`service user + tokens `: + + .. path /etc/nova/nova.conf + .. code-block:: ini + + [service_user] + send_service_user_token = true + auth_url = https://controller/identity + auth_strategy = keystone + auth_type = password + project_domain_name = Default + project_name = service + user_domain_name = Default + username = nova + password = NOVA_PASS + + Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in + the Identity service. + * In the ``[DEFAULT]`` section, configure the ``my_ip`` option to use the management interface IP address of the controller node: diff --git a/doc/source/install/controller-install-ubuntu.rst b/doc/source/install/controller-install-ubuntu.rst index 7282b0b2e22..1363a98ba8b 100644 --- a/doc/source/install/controller-install-ubuntu.rst +++ b/doc/source/install/controller-install-ubuntu.rst @@ -237,6 +237,26 @@ Install and configure components Comment out or remove any other options in the ``[keystone_authtoken]`` section. + * In the ``[service_user]`` section, configure :ref:`service user + tokens `: + + .. path /etc/nova/nova.conf + .. 
code-block:: ini + + [service_user] + send_service_user_token = true + auth_url = https://controller/identity + auth_strategy = keystone + auth_type = password + project_domain_name = Default + project_name = service + user_domain_name = Default + username = nova + password = NOVA_PASS + + Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in + the Identity service. + * In the ``[DEFAULT]`` section, configure the ``my_ip`` option to use the management interface IP address of the controller node: diff --git a/nova/cmd/status.py b/nova/cmd/status.py index 29e4a5d01ee..4a4e28d7e89 100644 --- a/nova/cmd/status.py +++ b/nova/cmd/status.py @@ -271,6 +271,15 @@ def _check_machine_type_set(self): return upgradecheck.Result(upgradecheck.Code.SUCCESS) + def _check_service_user_token(self): + if not CONF.service_user.send_service_user_token: + msg = (_(""" +Service user token configuration is required for all Nova services. +For more details see the following: +https://docs.openstack.org/latest/nova/admin/configuration/service-user-token.html""")) # noqa + return upgradecheck.Result(upgradecheck.Code.FAILURE, msg) + return upgradecheck.Result(upgradecheck.Code.SUCCESS) + # The format of the check functions is to return an upgradecheck.Result # object with the appropriate upgradecheck.Code and details set. 
If the # check hits warnings or failures then those should be stored in the @@ -294,6 +303,8 @@ def _check_machine_type_set(self): (_('Older than N-1 computes'), _check_old_computes), # Added in Wallaby (_('hw_machine_type unset'), _check_machine_type_set), + # Added in Bobcat + (_('Service User Token Configuration'), _check_service_user_token), ) diff --git a/nova/tests/unit/cmd/test_status.py b/nova/tests/unit/cmd/test_status.py index f5fcc168ee2..c6a0ab2d521 100644 --- a/nova/tests/unit/cmd/test_status.py +++ b/nova/tests/unit/cmd/test_status.py @@ -446,3 +446,19 @@ def test_instances_not_found_without_hw_machine_type(self): upgradecheck.Code.SUCCESS, result.code ) + + +class TestUpgradeCheckServiceUserToken(test.NoDBTestCase): + + def setUp(self): + super().setUp() + self.cmd = status.UpgradeCommands() + + def test_service_user_token_not_configured(self): + result = self.cmd._check_service_user_token() + self.assertEqual(upgradecheck.Code.FAILURE, result.code) + + def test_service_user_token_configured(self): + self.flags(send_service_user_token=True, group='service_user') + result = self.cmd._check_service_user_token() + self.assertEqual(upgradecheck.Code.SUCCESS, result.code) diff --git a/nova/tests/unit/virt/hyperv/test_vmops.py b/nova/tests/unit/virt/hyperv/test_vmops.py index 07e1774f9a7..1e3e50f92b2 100644 --- a/nova/tests/unit/virt/hyperv/test_vmops.py +++ b/nova/tests/unit/virt/hyperv/test_vmops.py @@ -1129,7 +1129,7 @@ def test_destroy(self, mock_unplug_vifs, mock_power_off, mock_unplug_vifs.assert_called_once_with( mock_instance, mock.sentinel.fake_network_info) mock_disconnect_volumes.assert_called_once_with( - mock.sentinel.FAKE_BD_INFO) + mock.sentinel.FAKE_BD_INFO, force=True) mock_delete_disk_files.assert_called_once_with( mock_instance.name) diff --git a/nova/tests/unit/virt/hyperv/test_volumeops.py b/nova/tests/unit/virt/hyperv/test_volumeops.py index 66d2c2527f5..f289d036320 100644 --- a/nova/tests/unit/virt/hyperv/test_volumeops.py +++ 
b/nova/tests/unit/virt/hyperv/test_volumeops.py @@ -141,7 +141,13 @@ def test_disconnect_volumes(self, mock_get_volume_driver): self._volumeops.disconnect_volumes(block_device_info) fake_volume_driver.disconnect_volume.assert_called_once_with( - block_device_mapping[0]['connection_info']) + block_device_mapping[0]['connection_info'], force=False) + + # Verify force=True + fake_volume_driver.disconnect_volume.reset_mock() + self._volumeops.disconnect_volumes(block_device_info, force=True) + fake_volume_driver.disconnect_volume.assert_called_once_with( + block_device_mapping[0]['connection_info'], force=True) @mock.patch('time.sleep') @mock.patch.object(volumeops.VolumeOps, '_get_volume_driver') @@ -181,7 +187,7 @@ def _test_attach_volume(self, mock_get_volume_driver, mock_sleep, if attach_failed: fake_volume_driver.disconnect_volume.assert_called_once_with( - fake_conn_info) + fake_conn_info, force=False) mock_sleep.assert_has_calls( [mock.call(CONF.hyperv.volume_attach_retry_interval)] * CONF.hyperv.volume_attach_retry_count) @@ -203,7 +209,13 @@ def test_disconnect_volume(self, mock_get_volume_driver): mock_get_volume_driver.assert_called_once_with( mock.sentinel.conn_info) fake_volume_driver.disconnect_volume.assert_called_once_with( - mock.sentinel.conn_info) + mock.sentinel.conn_info, force=False) + + # Verify force=True + fake_volume_driver.disconnect_volume.reset_mock() + self._volumeops.disconnect_volume(mock.sentinel.conn_info, force=True) + fake_volume_driver.disconnect_volume.assert_called_once_with( + mock.sentinel.conn_info, force=True) @mock.patch.object(volumeops.VolumeOps, '_get_volume_driver') def test_detach_volume(self, mock_get_volume_driver): @@ -347,7 +359,13 @@ def test_disconnect_volume(self): self._base_vol_driver.disconnect_volume(conn_info) self._conn.disconnect_volume.assert_called_once_with( - conn_info['data']) + conn_info['data'], force=False) + + # Verify force=True + self._conn.disconnect_volume.reset_mock() + 
self._base_vol_driver.disconnect_volume(conn_info, force=True) + self._conn.disconnect_volume.assert_called_once_with( + conn_info['data'], force=True) @mock.patch.object(volumeops.BaseVolumeDriver, '_get_disk_res_path') def _test_get_disk_resource_path_by_conn_info(self, diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 86e3661a343..9d0a8709a47 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -9584,7 +9584,7 @@ def test_disconnect_multiattach_single_connection( drvr._disconnect_volume( self.context, fake_connection_info, fake_instance_1) mock_volume_driver.disconnect_volume.assert_called_once_with( - fake_connection_info, fake_instance_1) + fake_connection_info, fake_instance_1, force=False) @mock.patch.object(libvirt_driver.LibvirtDriver, '_detach_encryptor') @mock.patch('nova.objects.InstanceList.get_uuids_by_host') @@ -9958,7 +9958,12 @@ def test_detach_volume_order_with_encryptors(self, mock_get_guest, device_name='vdc', ), mock.call.detach_encryptor(**encryption), - mock.call.disconnect_volume(connection_info, instance)]) + mock.call.disconnect_volume( + connection_info, + instance, + force=False, + ) + ]) get_device_conf_func = mock_detach_with_retry.mock_calls[0][1][2] self.assertEqual(mock_guest.get_disk, get_device_conf_func.func) self.assertEqual(('vdc',), get_device_conf_func.args) @@ -20257,16 +20262,64 @@ def test_cleanup_destroy_secrets(self, mock_disconnect_volume): self.context, mock.sentinel.connection_info, instance, - destroy_secrets=False + destroy_secrets=False, + force=True ), mock.call( self.context, mock.sentinel.connection_info, instance, - destroy_secrets=True + destroy_secrets=True, + force=True ) ]) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_volume_driver') + @mock.patch( + 'nova.virt.libvirt.driver.LibvirtDriver._should_disconnect_target', + new=mock.Mock(return_value=True)) + 
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._detach_encryptor', + new=mock.Mock()) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._undefine_domain', + new=mock.Mock()) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_vpmems', + new=mock.Mock(return_value=None)) + def test_cleanup_disconnect_volume(self, mock_vol_driver): + """Verify that we call disconnect_volume() with force=True + + cleanup() is called by destroy() when an instance is being deleted and + force=True should be passed down to os-brick's disconnect_volume() + call, which will ensure removal of devices regardless of errors. + + We need to ensure that devices are removed when an instance is being + deleted to avoid leaving leftover devices that could later be + erroneously connected by external entities (example: multipathd) to + instances that should not have access to the volumes. + + See https://bugs.launchpad.net/nova/+bug/2004555 for details. + """ + connection_info = mock.MagicMock() + block_device_info = { + 'block_device_mapping': [ + { + 'connection_info': connection_info + } + ] + } + instance = objects.Instance(self.context, **self.test_instance) + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI()) + + drvr.cleanup( + self.context, + instance, + network_info={}, + block_device_info=block_device_info, + destroy_vifs=False, + destroy_disks=False, + ) + mock_vol_driver.return_value.disconnect_volume.assert_called_once_with( + connection_info, instance, force=True) + @mock.patch.object(libvirt_driver.LibvirtDriver, '_get_volume_encryption') @mock.patch.object(libvirt_driver.LibvirtDriver, '_allow_native_luksv1') def test_swap_volume_native_luks_blocked(self, mock_allow_native_luksv1, diff --git a/nova/tests/unit/virt/libvirt/volume/test_fibrechannel.py b/nova/tests/unit/virt/libvirt/volume/test_fibrechannel.py index 06065322f6b..55054652c34 100644 --- a/nova/tests/unit/virt/libvirt/volume/test_fibrechannel.py +++ 
b/nova/tests/unit/virt/libvirt/volume/test_fibrechannel.py @@ -81,3 +81,23 @@ def test_extend_volume(self): self.assertEqual(requested_size, new_size) libvirt_driver.connector.extend_volume.assert_called_once_with( connection_info['data']) + + def test_disconnect_volume(self): + device_path = '/dev/fake-dev' + connection_info = {'data': {'device_path': device_path}} + + libvirt_driver = fibrechannel.LibvirtFibreChannelVolumeDriver( + self.fake_host) + libvirt_driver.connector.disconnect_volume = mock.MagicMock() + libvirt_driver.disconnect_volume( + connection_info, mock.sentinel.instance) + + libvirt_driver.connector.disconnect_volume.assert_called_once_with( + connection_info['data'], connection_info['data'], force=False) + + # Verify force=True + libvirt_driver.connector.disconnect_volume.reset_mock() + libvirt_driver.disconnect_volume( + connection_info, mock.sentinel.instance, force=True) + libvirt_driver.connector.disconnect_volume.assert_called_once_with( + connection_info['data'], connection_info['data'], force=True) diff --git a/nova/tests/unit/virt/libvirt/volume/test_iscsi.py b/nova/tests/unit/virt/libvirt/volume/test_iscsi.py index bd516b1dd6f..a1111e0d121 100644 --- a/nova/tests/unit/virt/libvirt/volume/test_iscsi.py +++ b/nova/tests/unit/virt/libvirt/volume/test_iscsi.py @@ -57,10 +57,19 @@ def test_libvirt_iscsi_driver_disconnect_volume_with_devicenotfound(self, device=device_path)) libvirt_driver.disconnect_volume(connection_info, mock.sentinel.instance) + libvirt_driver.connector.disconnect_volume.assert_called_once_with( + connection_info['data'], None, force=False) msg = mock_LOG_warning.call_args_list[0] self.assertIn('Ignoring VolumeDeviceNotFound', msg[0][0]) + # Verify force=True + libvirt_driver.connector.disconnect_volume.reset_mock() + libvirt_driver.disconnect_volume( + connection_info, mock.sentinel.instance, force=True) + libvirt_driver.connector.disconnect_volume.assert_called_once_with( + connection_info['data'], None, force=True) + 
def test_extend_volume(self): device_path = '/dev/fake-dev' connection_info = {'data': {'device_path': device_path}} diff --git a/nova/tests/unit/virt/libvirt/volume/test_lightos.py b/nova/tests/unit/virt/libvirt/volume/test_lightos.py index 8a85d730593..f97a696a53f 100644 --- a/nova/tests/unit/virt/libvirt/volume/test_lightos.py +++ b/nova/tests/unit/virt/libvirt/volume/test_lightos.py @@ -62,7 +62,13 @@ def test_libvirt_lightos_driver_disconnect(self): connection_info = {'data': disk_info} lightos_driver.disconnect_volume(connection_info, None) lightos_driver.connector.disconnect_volume.assert_called_once_with( - disk_info, None) + disk_info, None, force=False) + + # Verify force=True + lightos_driver.connector.disconnect_volume.reset_mock() + lightos_driver.disconnect_volume(connection_info, None, force=True) + lightos_driver.connector.disconnect_volume.assert_called_once_with( + disk_info, None, force=True) @mock.patch('os_brick.initiator.connector.InitiatorConnector.factory', new=mock.Mock(return_value=mock.Mock())) diff --git a/nova/tests/unit/virt/libvirt/volume/test_nvme.py b/nova/tests/unit/virt/libvirt/volume/test_nvme.py index 3f593841fae..42ef0adc8de 100644 --- a/nova/tests/unit/virt/libvirt/volume/test_nvme.py +++ b/nova/tests/unit/virt/libvirt/volume/test_nvme.py @@ -77,7 +77,13 @@ def test_libvirt_nvme_driver_disconnect(self): connection_info = {'data': disk_info} nvme_driver.disconnect_volume(connection_info, None) nvme_driver.connector.disconnect_volume.assert_called_once_with( - disk_info, None) + disk_info, None, force=False) + + # Verify force=True + nvme_driver.connector.disconnect_volume.reset_mock() + nvme_driver.disconnect_volume(connection_info, None, force=True) + nvme_driver.connector.disconnect_volume.assert_called_once_with( + disk_info, None, force=True) @mock.patch('os_brick.initiator.connector.InitiatorConnector.factory', new=mock.Mock(return_value=mock.Mock())) diff --git a/nova/tests/unit/virt/libvirt/volume/test_scaleio.py 
b/nova/tests/unit/virt/libvirt/volume/test_scaleio.py index f0fcba1deb2..7d93691d9d7 100644 --- a/nova/tests/unit/virt/libvirt/volume/test_scaleio.py +++ b/nova/tests/unit/virt/libvirt/volume/test_scaleio.py @@ -49,7 +49,13 @@ def test_libvirt_scaleio_driver_disconnect(self): conn = {'data': mock.sentinel.conn_data} sio.disconnect_volume(conn, mock.sentinel.instance) sio.connector.disconnect_volume.assert_called_once_with( - mock.sentinel.conn_data, None) + mock.sentinel.conn_data, None, force=False) + + # Verify force=True + sio.connector.disconnect_volume.reset_mock() + sio.disconnect_volume(conn, mock.sentinel.instance, force=True) + sio.connector.disconnect_volume.assert_called_once_with( + mock.sentinel.conn_data, None, force=True) @mock.patch('os_brick.initiator.connector.InitiatorConnector.factory', new=mock.Mock(return_value=mock.Mock())) diff --git a/nova/tests/unit/virt/libvirt/volume/test_storpool.py b/nova/tests/unit/virt/libvirt/volume/test_storpool.py index 678d4f8eb47..a3252b85259 100644 --- a/nova/tests/unit/virt/libvirt/volume/test_storpool.py +++ b/nova/tests/unit/virt/libvirt/volume/test_storpool.py @@ -53,9 +53,11 @@ def connect_volume(self, connection_info): } return {'type': 'block', 'path': test_attached[v]['path']} - def disconnect_volume(self, connection_info, device_info): + def disconnect_volume(self, connection_info, device_info, **kwargs): self.inst.assertIn('client_id', connection_info) self.inst.assertIn('volume', connection_info) + self.inst.assertIn('force', kwargs) + self.inst.assertEqual(self.inst.force, kwargs.get('force')) v = connection_info['volume'] if v not in test_attached: @@ -86,6 +88,11 @@ def factory(self, proto, helper): class LibvirtStorPoolVolumeDriverTestCase( test_volume.LibvirtVolumeBaseTestCase): + def setUp(self): + super().setUp() + # This is for testing the force flag of disconnect_volume() + self.force = False + def mock_storpool(f): def _config_inner_inner1(inst, *args, **kwargs): @mock.patch( @@ -175,3 
+182,10 @@ def test_storpool_attach_detach_extend(self): libvirt_driver.disconnect_volume(ci_2, mock.sentinel.instance) self.assertDictEqual({}, test_attached) + + # Connect the volume again so we can detach it again + libvirt_driver.connect_volume(ci_2, mock.sentinel.instance) + # Verify force=True + self.force = True + libvirt_driver.disconnect_volume( + ci_2, mock.sentinel.instance, force=True) diff --git a/nova/tests/unit/virt/libvirt/volume/test_vzstorage.py b/nova/tests/unit/virt/libvirt/volume/test_vzstorage.py index 168efee944f..c9e455b193a 100644 --- a/nova/tests/unit/virt/libvirt/volume/test_vzstorage.py +++ b/nova/tests/unit/virt/libvirt/volume/test_vzstorage.py @@ -95,7 +95,13 @@ def test_libvirt_vzstorage_driver_disconnect(self): conn = {'data': mock.sentinel.conn_data} drv.disconnect_volume(conn, mock.sentinel.instance) drv.connector.disconnect_volume.assert_called_once_with( - mock.sentinel.conn_data, None) + mock.sentinel.conn_data, None, force=False) + + # Verify force=True + drv.connector.disconnect_volume.reset_mock() + drv.disconnect_volume(conn, mock.sentinel.instance, force=True) + drv.connector.disconnect_volume.assert_called_once_with( + mock.sentinel.conn_data, None, force=True) def test_libvirt_vzstorage_driver_get_config(self): libvirt_driver = vzstorage.LibvirtVZStorageVolumeDriver(self.fake_host) diff --git a/nova/virt/hyperv/vmops.py b/nova/virt/hyperv/vmops.py index 3ec7e90c306..08adeada761 100644 --- a/nova/virt/hyperv/vmops.py +++ b/nova/virt/hyperv/vmops.py @@ -747,7 +747,7 @@ def destroy(self, instance, network_info, block_device_info, # should be disconnected even if the VM doesn't exist anymore, # so they are not leaked. 
self.unplug_vifs(instance, network_info) - self._volumeops.disconnect_volumes(block_device_info) + self._volumeops.disconnect_volumes(block_device_info, force=True) if destroy_disks: self._delete_disk_files(instance_name) diff --git a/nova/virt/hyperv/volumeops.py b/nova/virt/hyperv/volumeops.py index da5b40f3751..d2bfed2441e 100644 --- a/nova/virt/hyperv/volumeops.py +++ b/nova/virt/hyperv/volumeops.py @@ -59,10 +59,10 @@ def attach_volumes(self, volumes, instance_name): for vol in volumes: self.attach_volume(vol['connection_info'], instance_name) - def disconnect_volumes(self, block_device_info): + def disconnect_volumes(self, block_device_info, force=False): mapping = driver.block_device_info_get_mapping(block_device_info) for vol in mapping: - self.disconnect_volume(vol['connection_info']) + self.disconnect_volume(vol['connection_info'], force=force) def attach_volume(self, connection_info, instance_name, disk_bus=constants.CTRL_TYPE_SCSI): @@ -116,9 +116,9 @@ def _attach_volume(self, connection_info, instance_name, volume_driver.set_disk_qos_specs(connection_info, qos_specs) - def disconnect_volume(self, connection_info): + def disconnect_volume(self, connection_info, force=False): volume_driver = self._get_volume_driver(connection_info) - volume_driver.disconnect_volume(connection_info) + volume_driver.disconnect_volume(connection_info, force=force) def detach_volume(self, connection_info, instance_name): LOG.debug("Detaching volume: %(connection_info)s " @@ -231,8 +231,8 @@ def _connector(self): def connect_volume(self, connection_info): return self._connector.connect_volume(connection_info['data']) - def disconnect_volume(self, connection_info): - self._connector.disconnect_volume(connection_info['data']) + def disconnect_volume(self, connection_info, force=False): + self._connector.disconnect_volume(connection_info['data'], force=force) def get_disk_resource_path(self, connection_info): disk_paths = self._connector.get_volume_paths(connection_info['data']) 
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index e2471074377..1cbeda426c1 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -1644,7 +1644,7 @@ def _cleanup(self, context, instance, network_info, block_device_info=None, try: self._disconnect_volume( context, connection_info, instance, - destroy_secrets=destroy_secrets) + destroy_secrets=destroy_secrets, force=True) except Exception as exc: with excutils.save_and_reraise_exception() as ctxt: if cleanup_instance_disks: @@ -1961,7 +1961,7 @@ def _should_disconnect_target(self, context, instance, multiattach, return (False if connection_count > 1 else True) def _disconnect_volume(self, context, connection_info, instance, - encryption=None, destroy_secrets=True): + encryption=None, destroy_secrets=True, force=False): self._detach_encryptor( context, connection_info, @@ -1973,7 +1973,8 @@ def _disconnect_volume(self, context, connection_info, instance, multiattach = connection_info.get('multiattach', False) if self._should_disconnect_target( context, instance, multiattach, vol_driver, volume_id): - vol_driver.disconnect_volume(connection_info, instance) + vol_driver.disconnect_volume( + connection_info, instance, force=force) else: LOG.info('Detected multiple connections on this host for ' 'volume: %(volume)s, skipping target disconnect.', diff --git a/nova/virt/libvirt/volume/fibrechannel.py b/nova/virt/libvirt/volume/fibrechannel.py index b50db3aa1c0..1f890c95c12 100644 --- a/nova/virt/libvirt/volume/fibrechannel.py +++ b/nova/virt/libvirt/volume/fibrechannel.py @@ -59,7 +59,7 @@ def connect_volume(self, connection_info, instance): connection_info['data']['multipath_id'] = \ device_info['multipath_id'] - def disconnect_volume(self, connection_info, instance): + def disconnect_volume(self, connection_info, instance, force=False): """Detach the volume from instance_name.""" LOG.debug("calling os-brick to detach FC Volume", instance=instance) @@ -69,11 +69,12 @@ def 
disconnect_volume(self, connection_info, instance): # the 2nd param of disconnect_volume and be consistent # with the rest of the connectors. self.connector.disconnect_volume(connection_info['data'], - connection_info['data']) + connection_info['data'], + force=force) LOG.debug("Disconnected FC Volume", instance=instance) super(LibvirtFibreChannelVolumeDriver, - self).disconnect_volume(connection_info, instance) + self).disconnect_volume(connection_info, instance, force=force) def extend_volume(self, connection_info, instance, requested_size): """Extend the volume.""" diff --git a/nova/virt/libvirt/volume/fs.py b/nova/virt/libvirt/volume/fs.py index 5fb9af4a520..992ef45016e 100644 --- a/nova/virt/libvirt/volume/fs.py +++ b/nova/virt/libvirt/volume/fs.py @@ -116,7 +116,7 @@ def connect_volume(self, connection_info, instance): connection_info['data']['device_path'] = \ self._get_device_path(connection_info) - def disconnect_volume(self, connection_info, instance): + def disconnect_volume(self, connection_info, instance, force=False): """Disconnect the volume.""" vol_name = connection_info['data']['name'] mountpoint = self._get_mount_path(connection_info) diff --git a/nova/virt/libvirt/volume/iscsi.py b/nova/virt/libvirt/volume/iscsi.py index 564bac14cc7..2b25972a495 100644 --- a/nova/virt/libvirt/volume/iscsi.py +++ b/nova/virt/libvirt/volume/iscsi.py @@ -66,19 +66,20 @@ def connect_volume(self, connection_info, instance): connection_info['data']['device_path'] = device_info['path'] - def disconnect_volume(self, connection_info, instance): + def disconnect_volume(self, connection_info, instance, force=False): """Detach the volume from instance_name.""" LOG.debug("calling os-brick to detach iSCSI Volume", instance=instance) try: - self.connector.disconnect_volume(connection_info['data'], None) + self.connector.disconnect_volume( + connection_info['data'], None, force=force) except os_brick_exception.VolumeDeviceNotFound as exc: LOG.warning('Ignoring 
VolumeDeviceNotFound: %s', exc) return LOG.debug("Disconnected iSCSI Volume", instance=instance) super(LibvirtISCSIVolumeDriver, - self).disconnect_volume(connection_info, instance) + self).disconnect_volume(connection_info, instance, force=force) def extend_volume(self, connection_info, instance, requested_size): """Extend the volume.""" diff --git a/nova/virt/libvirt/volume/lightos.py b/nova/virt/libvirt/volume/lightos.py index d6d393994e5..6a22bf6dc63 100644 --- a/nova/virt/libvirt/volume/lightos.py +++ b/nova/virt/libvirt/volume/lightos.py @@ -42,14 +42,15 @@ def connect_volume(self, connection_info, instance): LOG.debug("Connecting NVMe volume with device_info %s", device_info) connection_info['data']['device_path'] = device_info['path'] - def disconnect_volume(self, connection_info, instance): + def disconnect_volume(self, connection_info, instance, force=False): """Detach the volume from the instance.""" LOG.debug("Disconnecting NVMe disk. instance:%s, volume_id:%s", connection_info.get("instance", ""), connection_info.get("volume_id", "")) - self.connector.disconnect_volume(connection_info['data'], None) + self.connector.disconnect_volume( + connection_info['data'], None, force=force) super(LibvirtLightOSVolumeDriver, self).disconnect_volume( - connection_info, instance) + connection_info, instance, force=force) def extend_volume(self, connection_info, instance, requested_size=None): """Extend the volume.""" diff --git a/nova/virt/libvirt/volume/nvme.py b/nova/virt/libvirt/volume/nvme.py index 74365528122..e2977c3572b 100644 --- a/nova/virt/libvirt/volume/nvme.py +++ b/nova/virt/libvirt/volume/nvme.py @@ -45,13 +45,13 @@ def connect_volume(self, connection_info, instance): connection_info['data']['device_path'] = device_info['path'] - def disconnect_volume(self, connection_info, instance): + def disconnect_volume(self, connection_info, instance, force=False): """Detach the volume from the instance.""" LOG.debug("Disconnecting NVMe disk", instance=instance) 
self.connector.disconnect_volume( - connection_info['data'], None) + connection_info['data'], None, force=force) super(LibvirtNVMEVolumeDriver, - self).disconnect_volume(connection_info, instance) + self).disconnect_volume(connection_info, instance, force=force) def extend_volume(self, connection_info, instance, requested_size): """Extend the volume.""" diff --git a/nova/virt/libvirt/volume/quobyte.py b/nova/virt/libvirt/volume/quobyte.py index bb7a770e57e..2eb4bcfb428 100644 --- a/nova/virt/libvirt/volume/quobyte.py +++ b/nova/virt/libvirt/volume/quobyte.py @@ -189,7 +189,7 @@ def connect_volume(self, connection_info, instance): instance=instance) @utils.synchronized('connect_qb_volume') - def disconnect_volume(self, connection_info, instance): + def disconnect_volume(self, connection_info, instance, force=False): """Disconnect the volume.""" mount_path = self._get_mount_path(connection_info) diff --git a/nova/virt/libvirt/volume/scaleio.py b/nova/virt/libvirt/volume/scaleio.py index 7c414c2870f..04a9423e8ea 100644 --- a/nova/virt/libvirt/volume/scaleio.py +++ b/nova/virt/libvirt/volume/scaleio.py @@ -57,12 +57,13 @@ def connect_volume(self, connection_info, instance): instance=instance) connection_info['data']['device_path'] = device_info['path'] - def disconnect_volume(self, connection_info, instance): - self.connector.disconnect_volume(connection_info['data'], None) + def disconnect_volume(self, connection_info, instance, force=False): + self.connector.disconnect_volume( + connection_info['data'], None, force=force) LOG.debug("Disconnected volume", instance=instance) super(LibvirtScaleIOVolumeDriver, self).disconnect_volume( - connection_info, instance) + connection_info, instance, force=force) def extend_volume(self, connection_info, instance, requested_size): LOG.debug("calling os-brick to extend ScaleIO Volume", diff --git a/nova/virt/libvirt/volume/smbfs.py b/nova/virt/libvirt/volume/smbfs.py index d112af750cb..9de1ce23cd3 100644 --- 
a/nova/virt/libvirt/volume/smbfs.py +++ b/nova/virt/libvirt/volume/smbfs.py @@ -52,7 +52,7 @@ def connect_volume(self, connection_info, instance): device_path = self._get_device_path(connection_info) connection_info['data']['device_path'] = device_path - def disconnect_volume(self, connection_info, instance): + def disconnect_volume(self, connection_info, instance, force=False): """Disconnect the volume.""" smbfs_share = connection_info['data']['export'] mount_path = self._get_mount_path(connection_info) diff --git a/nova/virt/libvirt/volume/storpool.py b/nova/virt/libvirt/volume/storpool.py index 0e71221f5b2..e6dffca39a6 100644 --- a/nova/virt/libvirt/volume/storpool.py +++ b/nova/virt/libvirt/volume/storpool.py @@ -47,10 +47,11 @@ def connect_volume(self, connection_info, instance): device_info, instance=instance) connection_info['data']['device_path'] = device_info['path'] - def disconnect_volume(self, connection_info, instance): + def disconnect_volume(self, connection_info, instance, force=False): LOG.debug("Detaching StorPool volume %s", connection_info['data']['volume'], instance=instance) - self.connector.disconnect_volume(connection_info['data'], None) + self.connector.disconnect_volume( + connection_info['data'], None, force=force) LOG.debug("Detached StorPool volume", instance=instance) def extend_volume(self, connection_info, instance, requested_size): diff --git a/nova/virt/libvirt/volume/volume.py b/nova/virt/libvirt/volume/volume.py index 6d650c80e64..f76c3618b27 100644 --- a/nova/virt/libvirt/volume/volume.py +++ b/nova/virt/libvirt/volume/volume.py @@ -135,7 +135,7 @@ def connect_volume(self, connection_info, instance): """Connect the volume.""" pass - def disconnect_volume(self, connection_info, instance): + def disconnect_volume(self, connection_info, instance, force=False): """Disconnect the volume.""" pass diff --git a/nova/virt/libvirt/volume/vzstorage.py b/nova/virt/libvirt/volume/vzstorage.py index 85ffb450765..babfdef55c6 100644 --- 
a/nova/virt/libvirt/volume/vzstorage.py +++ b/nova/virt/libvirt/volume/vzstorage.py @@ -126,9 +126,10 @@ def _connect_volume(connection_info, instance): return _connect_volume(connection_info, instance) - def disconnect_volume(self, connection_info, instance): + def disconnect_volume(self, connection_info, instance, force=False): """Detach the volume from instance_name.""" LOG.debug("calling os-brick to detach Vzstorage Volume", instance=instance) - self.connector.disconnect_volume(connection_info['data'], None) + self.connector.disconnect_volume( + connection_info['data'], None, force=force) LOG.debug("Disconnected Vzstorage Volume", instance=instance) diff --git a/releasenotes/notes/service-user-token-421d067c16257782.yaml b/releasenotes/notes/service-user-token-421d067c16257782.yaml new file mode 100644 index 00000000000..d3af14fbb85 --- /dev/null +++ b/releasenotes/notes/service-user-token-421d067c16257782.yaml @@ -0,0 +1,11 @@ +upgrade: + - | + Configuration of service user tokens is now **required** for all Nova services + to ensure security of block-storage volume data. + + All Nova configuration files must configure the ``[service_user]`` section as + described in the `documentation`__. + + See https://bugs.launchpad.net/nova/+bug/2004555 for more details. + + __ https://docs.openstack.org/nova/latest/admin/configuration/service-user-token.html From 0d6dd6c67f56c9d4ed36246d14f119da6bca0a5a Mon Sep 17 00:00:00 2001 From: melanie witt Date: Tue, 9 May 2023 03:11:25 +0000 Subject: [PATCH 24/73] Enable use of service user token with admin context When the [service_user] section is configured in nova.conf, nova will have the ability to send a service user token alongside the user's token. The service user token is sent when nova calls other services' REST APIs to authenticate as a service, and service calls can sometimes have elevated privileges. Currently, nova does not however have the ability to send a service user token with an admin context. 
This means that when nova makes REST API calls to other services with an anonymous admin RequestContext (such as in nova-manage or periodic tasks), it will not be authenticated as a service. This adds a keyword argument to service_auth.get_auth_plugin() to enable callers to provide a user_auth object instead of attempting to extract the user_auth from the RequestContext. The cinder and neutron client modules are also adjusted to make use of the new user_auth keyword argument so that nova calls made with anonymous admin request contexts can authenticate as a service when configured. Related-Bug: #2004555 Change-Id: I14df2d55f4b2f0be58f1a6ad3f19e48f7a6bfcb4 (cherry picked from commit 41c64b94b0af333845e998f6cc195e72ca5ab6bc) (cherry picked from commit 1f781423ee4224c0871ab4aafec191bb2f7ef0e4) --- nova/network/neutron.py | 8 +++++--- nova/service_auth.py | 6 ++++-- nova/tests/unit/network/test_neutron.py | 16 ++++++++++++++++ nova/tests/unit/test_service_auth.py | 10 ++++++++++ nova/tests/unit/volume/test_cinder.py | 11 +++++++++++ nova/volume/cinder.py | 8 +++++--- 6 files changed, 51 insertions(+), 8 deletions(-) diff --git a/nova/network/neutron.py b/nova/network/neutron.py index 27e7d064553..affd76535fc 100644 --- a/nova/network/neutron.py +++ b/nova/network/neutron.py @@ -222,13 +222,15 @@ def _get_auth_plugin(context, admin=False): # support some services (metadata API) where an admin context is used # without an auth token. global _ADMIN_AUTH + user_auth = None if admin or (context.is_admin and not context.auth_token): if not _ADMIN_AUTH: _ADMIN_AUTH = _load_auth_plugin(CONF) - return _ADMIN_AUTH + user_auth = _ADMIN_AUTH - if context.auth_token: - return service_auth.get_auth_plugin(context) + if context.auth_token or user_auth: + # When user_auth = None, user_auth will be extracted from the context. 
+ return service_auth.get_auth_plugin(context, user_auth=user_auth) # We did not get a user token and we should not be using # an admin token so log an error diff --git a/nova/service_auth.py b/nova/service_auth.py index f5ae0646d8a..aa8fd8fa123 100644 --- a/nova/service_auth.py +++ b/nova/service_auth.py @@ -30,8 +30,10 @@ def reset_globals(): _SERVICE_AUTH = None -def get_auth_plugin(context): - user_auth = context.get_auth_plugin() +def get_auth_plugin(context, user_auth=None): + # user_auth may be passed in when the RequestContext is anonymous, such as + # when get_admin_context() is used for API calls by nova-manage. + user_auth = user_auth or context.get_auth_plugin() if CONF.service_user.send_service_user_token: global _SERVICE_AUTH diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py index eefa7b974fc..fec66fb2d31 100644 --- a/nova/tests/unit/network/test_neutron.py +++ b/nova/tests/unit/network/test_neutron.py @@ -142,6 +142,22 @@ def test_non_admin_with_service_token(self, mock_load): self.assertIsInstance(cl.httpclient.auth, service_token.ServiceTokenAuthWrapper) + @mock.patch('nova.service_auth._SERVICE_AUTH') + @mock.patch('nova.network.neutron._ADMIN_AUTH') + @mock.patch.object(ks_loading, 'load_auth_from_conf_options') + def test_admin_with_service_token( + self, mock_load, mock_admin_auth, mock_service_auth + ): + self.flags(send_service_user_token=True, group='service_user') + + admin_context = context.get_admin_context() + + cl = neutronapi.get_client(admin_context) + self.assertIsInstance(cl.httpclient.auth, + service_token.ServiceTokenAuthWrapper) + self.assertEqual(mock_admin_auth, cl.httpclient.auth.user_auth) + self.assertEqual(mock_service_auth, cl.httpclient.auth.service_auth) + @mock.patch.object(client.Client, "list_networks", side_effect=exceptions.Unauthorized()) def test_Unauthorized_user(self, mock_list_networks): diff --git a/nova/tests/unit/test_service_auth.py 
b/nova/tests/unit/test_service_auth.py index 5f075151880..8966af3ce32 100644 --- a/nova/tests/unit/test_service_auth.py +++ b/nova/tests/unit/test_service_auth.py @@ -56,3 +56,13 @@ def test_get_auth_plugin_wraps_bad_config(self, mock_load): result = service_auth.get_auth_plugin(self.ctx) self.assertEqual(1, mock_load.call_count) self.assertNotIsInstance(result, service_token.ServiceTokenAuthWrapper) + + @mock.patch.object(ks_loading, 'load_auth_from_conf_options', + new=mock.Mock()) + def test_get_auth_plugin_user_auth(self): + self.flags(send_service_user_token=True, group='service_user') + user_auth = mock.Mock() + + result = service_auth.get_auth_plugin(self.ctx, user_auth=user_auth) + + self.assertEqual(user_auth, result.user_auth) diff --git a/nova/tests/unit/volume/test_cinder.py b/nova/tests/unit/volume/test_cinder.py index e53ebe3cb8f..f9080726fbb 100644 --- a/nova/tests/unit/volume/test_cinder.py +++ b/nova/tests/unit/volume/test_cinder.py @@ -1276,3 +1276,14 @@ def test_admin_context_without_token(self, admin_ctx = context.get_admin_context() params = cinder._get_cinderclient_parameters(admin_ctx) self.assertEqual(params[0], mock_admin_auth) + + @mock.patch('nova.service_auth._SERVICE_AUTH') + @mock.patch('nova.volume.cinder._ADMIN_AUTH') + def test_admin_context_without_user_token_but_with_service_token( + self, mock_admin_auth, mock_service_auth + ): + self.flags(send_service_user_token=True, group='service_user') + admin_ctx = context.get_admin_context() + params = cinder._get_cinderclient_parameters(admin_ctx) + self.assertEqual(mock_admin_auth, params[0].user_auth) + self.assertEqual(mock_service_auth, params[0].service_auth) diff --git a/nova/volume/cinder.py b/nova/volume/cinder.py index 01efcfec19b..f5328148d24 100644 --- a/nova/volume/cinder.py +++ b/nova/volume/cinder.py @@ -91,12 +91,14 @@ def _get_auth(context): # from them generated from 'context.get_admin_context' # which only set is_admin=True but is without token. 
# So add load_auth_plugin when this condition appear. + user_auth = None if context.is_admin and not context.auth_token: if not _ADMIN_AUTH: _ADMIN_AUTH = _load_auth_plugin(CONF) - return _ADMIN_AUTH - else: - return service_auth.get_auth_plugin(context) + user_auth = _ADMIN_AUTH + + # When user_auth = None, user_auth will be extracted from the context. + return service_auth.get_auth_plugin(context, user_auth=user_auth) # NOTE(efried): Bug #1752152 From 9e86be5a5365b1896d489de7149e471fd22881d6 Mon Sep 17 00:00:00 2001 From: Jorge San Emeterio Date: Wed, 8 Feb 2023 15:33:54 +0100 Subject: [PATCH 25/73] Have host look for CPU controller of cgroupsv2 location. Make the host class look under '/sys/fs/cgroup/cgroup.controllers' for support of the cpu controller. The host will try searching through cgroupsv1 first, just like up until now, and in the case that fails, it will try cgroupsv2 then. The host will not support the feature if both checks fail. This new check needs to be mocked by all tests that focus on this piece of code, as it touches a system file that requires privileges. For such thing, the CGroupsFixture is defined to easily add such mocking to all test cases that require so. I also removed old mocking at test_driver.py in favor of the fixture from above. 
Partial-Bug: #2008102 Change-Id: I99b57c27c8a4425389bec2b7f05af660bab85610 (cherry picked from commit 973ff4fc1a0586937d13f2b39e517422713b1003) (cherry picked from commit eb3fe4ddc621380afa32ec9aec0c285f36f99ee3) --- nova/tests/fixtures/nova.py | 71 +++++++++++++++++++ nova/tests/functional/libvirt/base.py | 1 + .../tests/functional/libvirt/test_evacuate.py | 1 + nova/tests/functional/libvirt/test_vpmem.py | 1 + .../regressions/test_bug_1595962.py | 1 + nova/tests/unit/virt/libvirt/test_driver.py | 50 ++++--------- nova/tests/unit/virt/libvirt/test_host.py | 64 +++++++++++++---- nova/tests/unit/virt/test_virt_drivers.py | 1 + nova/virt/libvirt/host.py | 31 +++++++- 9 files changed, 170 insertions(+), 51 deletions(-) diff --git a/nova/tests/fixtures/nova.py b/nova/tests/fixtures/nova.py index 129b2f9abb0..a33df9b9465 100644 --- a/nova/tests/fixtures/nova.py +++ b/nova/tests/fixtures/nova.py @@ -1316,6 +1316,77 @@ def setUp(self): nova.privsep.sys_admin_pctxt, 'client_mode', False)) +class CGroupsFixture(fixtures.Fixture): + """Mocks checks made for available subsystems on the host's control group. + + The fixture mocks all calls made on the host to verify the capabilities + provided by its kernel. Through this, one can simulate the underlying + system hosts work on top of and have tests react to expected outcomes from + such. + + Use sample: + >>> cgroups = self.useFixture(CGroupsFixture()) + >>> cgroups = self.useFixture(CGroupsFixture(version=2)) + >>> cgroups = self.useFixture(CGroupsFixture()) + ... cgroups.version = 2 + + :attr version: Arranges mocks to simulate the host interact with nova + following the given version of cgroups. + Available values are: + - 0: All checks related to cgroups will return False. + - 1: Checks related to cgroups v1 will return True. + - 2: Checks related to cgroups v2 will return True. + Defaults to 1. 
+ """ + + def __init__(self, version=1): + self._cpuv1 = None + self._cpuv2 = None + + self._version = version + + @property + def version(self): + return self._version + + @version.setter + def version(self, value): + self._version = value + self._update_mocks() + + def setUp(self): + super().setUp() + self._cpuv1 = self.useFixture(fixtures.MockPatch( + 'nova.virt.libvirt.host.Host._has_cgroupsv1_cpu_controller')).mock + self._cpuv2 = self.useFixture(fixtures.MockPatch( + 'nova.virt.libvirt.host.Host._has_cgroupsv2_cpu_controller')).mock + self._update_mocks() + + def _update_mocks(self): + if not self._cpuv1: + return + + if not self._cpuv2: + return + + if self.version == 0: + self._cpuv1.return_value = False + self._cpuv2.return_value = False + return + + if self.version == 1: + self._cpuv1.return_value = True + self._cpuv2.return_value = False + return + + if self.version == 2: + self._cpuv1.return_value = False + self._cpuv2.return_value = True + return + + raise ValueError(f"Unknown cgroups version: '{self.version}'.") + + class NoopQuotaDriverFixture(fixtures.Fixture): """A fixture to run tests using the NoopQuotaDriver. 
diff --git a/nova/tests/functional/libvirt/base.py b/nova/tests/functional/libvirt/base.py index 47a8bbe81c9..ab59410399d 100644 --- a/nova/tests/functional/libvirt/base.py +++ b/nova/tests/functional/libvirt/base.py @@ -42,6 +42,7 @@ def setUp(self): super(ServersTestBase, self).setUp() self.useFixture(nova_fixtures.LibvirtImageBackendFixture()) + self.useFixture(nova_fixtures.CGroupsFixture()) self.libvirt = self.useFixture(nova_fixtures.LibvirtFixture()) self.useFixture(nova_fixtures.OSBrickFixture()) diff --git a/nova/tests/functional/libvirt/test_evacuate.py b/nova/tests/functional/libvirt/test_evacuate.py index 9d3deec99db..c7e0abed531 100644 --- a/nova/tests/functional/libvirt/test_evacuate.py +++ b/nova/tests/functional/libvirt/test_evacuate.py @@ -427,6 +427,7 @@ def setUp(self): self.useFixture(nova_fixtures.NeutronFixture(self)) self.useFixture(nova_fixtures.GlanceFixture(self)) self.useFixture(func_fixtures.PlacementFixture()) + self.useFixture(nova_fixtures.CGroupsFixture()) fake_network.set_stub_network_methods(self) api_fixture = self.useFixture( diff --git a/nova/tests/functional/libvirt/test_vpmem.py b/nova/tests/functional/libvirt/test_vpmem.py index d1cad0e376c..b76e154997c 100644 --- a/nova/tests/functional/libvirt/test_vpmem.py +++ b/nova/tests/functional/libvirt/test_vpmem.py @@ -75,6 +75,7 @@ def setUp(self): 'nova.privsep.libvirt.get_pmem_namespaces', return_value=self.fake_pmem_namespaces)) self.useFixture(nova_fixtures.LibvirtImageBackendFixture()) + self.useFixture(nova_fixtures.CGroupsFixture()) self.useFixture(fixtures.MockPatch( 'nova.virt.libvirt.LibvirtDriver._get_local_gb_info', return_value={'total': 128, diff --git a/nova/tests/functional/regressions/test_bug_1595962.py b/nova/tests/functional/regressions/test_bug_1595962.py index 94421a81f90..9232eea3351 100644 --- a/nova/tests/functional/regressions/test_bug_1595962.py +++ b/nova/tests/functional/regressions/test_bug_1595962.py @@ -47,6 +47,7 @@ def setUp(self): 
'nova.virt.libvirt.guest.libvirt', fakelibvirt)) self.useFixture(nova_fixtures.LibvirtFixture()) + self.useFixture(nova_fixtures.CGroupsFixture()) self.admin_api = api_fixture.admin_api self.api = api_fixture.api diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 86e3661a343..89de8aa8df7 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -740,6 +740,7 @@ def setUp(self): imagebackend.Image._get_driver_format) self.libvirt = self.useFixture(nova_fixtures.LibvirtFixture()) + self.cgroups = self.useFixture(nova_fixtures.CGroupsFixture()) # ensure tests perform the same on all host architectures; this is # already done by the fakelibvirt fixture but we want to change the @@ -3047,9 +3048,7 @@ def test_get_live_migrate_numa_info_empty(self, _): 'fake-flavor', 'fake-image-meta').obj_to_primitive()) @mock.patch.object(host.Host, "_check_machine_type", new=mock.Mock()) - @mock.patch.object( - host.Host, "is_cpu_control_policy_capable", return_value=True) - def test_get_guest_config_numa_host_instance_fits(self, is_able): + def test_get_guest_config_numa_host_instance_fits(self): self.flags(cpu_shared_set=None, cpu_dedicated_set=None, group='compute') instance_ref = objects.Instance(**self.test_instance) @@ -3087,9 +3086,7 @@ def test_get_guest_config_numa_host_instance_fits(self, is_able): @mock.patch.object(host.Host, "_check_machine_type", new=mock.Mock()) @mock.patch('nova.privsep.utils.supports_direct_io', new=mock.Mock(return_value=True)) - @mock.patch.object( - host.Host, "is_cpu_control_policy_capable", return_value=True) - def test_get_guest_config_numa_host_instance_no_fit(self, is_able): + def test_get_guest_config_numa_host_instance_no_fit(self): instance_ref = objects.Instance(**self.test_instance) image_meta = objects.ImageMeta.from_dict(self.test_image_meta) flavor = objects.Flavor(memory_mb=4096, vcpus=4, root_gb=496, @@ -3516,10 +3513,7 @@ 
def test_get_guest_memory_backing_config_file_backed_hugepages(self): host_topology, inst_topology, numa_tune) @mock.patch.object(host.Host, "_check_machine_type", new=mock.Mock()) - @mock.patch.object( - host.Host, "is_cpu_control_policy_capable", return_value=True) - def test_get_guest_config_numa_host_instance_pci_no_numa_info( - self, is_able): + def test_get_guest_config_numa_host_instance_pci_no_numa_info(self): self.flags(cpu_shared_set='3', cpu_dedicated_set=None, group='compute') @@ -3573,10 +3567,7 @@ def test_get_guest_config_numa_host_instance_pci_no_numa_info( @mock.patch.object(host.Host, "_check_machine_type", new=mock.Mock()) @mock.patch('nova.privsep.utils.supports_direct_io', new=mock.Mock(return_value=True)) - @mock.patch.object( - host.Host, "is_cpu_control_policy_capable", return_value=True) - def test_get_guest_config_numa_host_instance_2pci_no_fit( - self, is_able): + def test_get_guest_config_numa_host_instance_2pci_no_fit(self): self.flags(cpu_shared_set='3', cpu_dedicated_set=None, group='compute') instance_ref = objects.Instance(**self.test_instance) @@ -3693,10 +3684,7 @@ def test_get_guest_config_numa_other_arch_qemu(self): None) @mock.patch.object(host.Host, "_check_machine_type", new=mock.Mock()) - @mock.patch.object( - host.Host, "is_cpu_control_policy_capable", return_value=True) - def test_get_guest_config_numa_host_instance_fit_w_cpu_pinset( - self, is_able): + def test_get_guest_config_numa_host_instance_fit_w_cpu_pinset(self): self.flags(cpu_shared_set='2-3', cpu_dedicated_set=None, group='compute') @@ -3735,10 +3723,7 @@ def test_get_guest_config_numa_host_instance_fit_w_cpu_pinset( self.assertIsNone(cfg.cpu.numa) @mock.patch.object(host.Host, "_check_machine_type", new=mock.Mock()) - @mock.patch.object( - host.Host, "is_cpu_control_policy_capable", return_value=True) - def test_get_guest_config_non_numa_host_instance_topo( - self, is_able): + def test_get_guest_config_non_numa_host_instance_topo(self): instance_topology = 
objects.InstanceNUMATopology(cells=[ objects.InstanceNUMACell( id=0, cpuset=set([0]), pcpuset=set(), memory=1024), @@ -3786,10 +3771,7 @@ def test_get_guest_config_non_numa_host_instance_topo( numa_cfg_cell.memory) @mock.patch.object(host.Host, "_check_machine_type", new=mock.Mock()) - @mock.patch.object( - host.Host, "is_cpu_control_policy_capable", return_value=True) - def test_get_guest_config_numa_host_instance_topo( - self, is_able): + def test_get_guest_config_numa_host_instance_topo(self): self.flags(cpu_shared_set='0-5', cpu_dedicated_set=None, group='compute') @@ -7199,9 +7181,7 @@ def test_get_guest_config_with_rng_dev_not_present(self, mock_path): [], image_meta, disk_info) - @mock.patch.object( - host.Host, "is_cpu_control_policy_capable", return_value=True) - def test_get_guest_config_with_cpu_quota(self, is_able): + def test_get_guest_config_with_cpu_quota(self): self.flags(virt_type='kvm', group='libvirt') drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) @@ -7537,9 +7517,7 @@ def test_get_guest_config_disk_cachemodes_network( self.flags(images_type='rbd', group='libvirt') self._test_get_guest_config_disk_cachemodes('rbd') - @mock.patch.object( - host.Host, "is_cpu_control_policy_capable", return_value=True) - def test_get_guest_config_with_bogus_cpu_quota(self, is_able): + def test_get_guest_config_with_bogus_cpu_quota(self): self.flags(virt_type='kvm', group='libvirt') drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) @@ -7557,9 +7535,10 @@ def test_get_guest_config_with_bogus_cpu_quota(self, is_able): drvr._get_guest_config, instance_ref, [], image_meta, disk_info) - @mock.patch.object( - host.Host, "is_cpu_control_policy_capable", return_value=False) - def test_get_update_guest_cputune(self, is_able): + def test_get_update_guest_cputune(self): + # No CPU controller on the host + self.cgroups.version = 0 + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) instance_ref = objects.Instance(**self.test_instance) 
instance_ref.flavor.extra_specs = {'quota:cpu_shares': '10000', @@ -22110,6 +22089,7 @@ def setUp(self): self.flags(sysinfo_serial="none", group="libvirt") self.flags(instances_path=self.useFixture(fixtures.TempDir()).path) self.useFixture(nova_fixtures.LibvirtFixture()) + self.useFixture(nova_fixtures.CGroupsFixture()) os_vif.initialize() self.drvr = libvirt_driver.LibvirtDriver( diff --git a/nova/tests/unit/virt/libvirt/test_host.py b/nova/tests/unit/virt/libvirt/test_host.py index 3afd6c139df..534d4489d70 100644 --- a/nova/tests/unit/virt/libvirt/test_host.py +++ b/nova/tests/unit/virt/libvirt/test_host.py @@ -1613,25 +1613,59 @@ def test_compare_cpu(self, mock_compareCPU): self.host.compare_cpu("cpuxml") mock_compareCPU.assert_called_once_with("cpuxml", 0) - def test_is_cpu_control_policy_capable_ok(self): + def test_is_cpu_control_policy_capable_via_neither(self): + self.useFixture(nova_fixtures.CGroupsFixture(version=0)) + self.assertFalse(self.host.is_cpu_control_policy_capable()) + + def test_is_cpu_control_policy_capable_via_cgroupsv1(self): + self.useFixture(nova_fixtures.CGroupsFixture(version=1)) + self.assertTrue(self.host.is_cpu_control_policy_capable()) + + def test_is_cpu_control_policy_capable_via_cgroupsv2(self): + self.useFixture(nova_fixtures.CGroupsFixture(version=2)) + self.assertTrue(self.host.is_cpu_control_policy_capable()) + + def test_has_cgroupsv1_cpu_controller_ok(self): m = mock.mock_open( - read_data="""cg /cgroup/cpu,cpuacct cg opt1,cpu,opt3 0 0 -cg /cgroup/memory cg opt1,opt2 0 0 -""") - with mock.patch('builtins.open', m, create=True): - self.assertTrue(self.host.is_cpu_control_policy_capable()) + read_data=( + "cg /cgroup/cpu,cpuacct cg opt1,cpu,opt3 0 0" + "cg /cgroup/memory cg opt1,opt2 0 0" + ) + ) + with mock.patch("builtins.open", m, create=True): + self.assertTrue(self.host._has_cgroupsv1_cpu_controller()) - def test_is_cpu_control_policy_capable_ko(self): + def test_has_cgroupsv1_cpu_controller_ko(self): m = mock.mock_open( 
- read_data="""cg /cgroup/cpu,cpuacct cg opt1,opt2,opt3 0 0 -cg /cgroup/memory cg opt1,opt2 0 0 -""") - with mock.patch('builtins.open', m, create=True): - self.assertFalse(self.host.is_cpu_control_policy_capable()) + read_data=( + "cg /cgroup/cpu,cpuacct cg opt1,opt2,opt3 0 0" + "cg /cgroup/memory cg opt1,opt2 0 0" + ) + ) + with mock.patch("builtins.open", m, create=True): + self.assertFalse(self.host._has_cgroupsv1_cpu_controller()) - @mock.patch('builtins.open', side_effect=IOError) - def test_is_cpu_control_policy_capable_ioerror(self, mock_open): - self.assertFalse(self.host.is_cpu_control_policy_capable()) + @mock.patch("builtins.open", side_effect=IOError) + def test_has_cgroupsv1_cpu_controller_ioerror(self, _): + self.assertFalse(self.host._has_cgroupsv1_cpu_controller()) + + def test_has_cgroupsv2_cpu_controller_ok(self): + m = mock.mock_open( + read_data="cpuset cpu io memory hugetlb pids rdma misc" + ) + with mock.patch("builtins.open", m, create=True): + self.assertTrue(self.host._has_cgroupsv2_cpu_controller()) + + def test_has_cgroupsv2_cpu_controller_ko(self): + m = mock.mock_open( + read_data="memory pids" + ) + with mock.patch("builtins.open", m, create=True): + self.assertFalse(self.host._has_cgroupsv2_cpu_controller()) + + @mock.patch("builtins.open", side_effect=IOError) + def test_has_cgroupsv2_cpu_controller_ioerror(self, _): + self.assertFalse(self.host._has_cgroupsv2_cpu_controller()) def test_get_canonical_machine_type(self): # this test relies on configuration from the FakeLibvirtFixture diff --git a/nova/tests/unit/virt/test_virt_drivers.py b/nova/tests/unit/virt/test_virt_drivers.py index 58fa3d4c272..ed9f1e3822d 100644 --- a/nova/tests/unit/virt/test_virt_drivers.py +++ b/nova/tests/unit/virt/test_virt_drivers.py @@ -832,6 +832,7 @@ def setUp(self): # This is needed for the live migration tests which spawn off the # operation for monitoring. 
self.useFixture(nova_fixtures.SpawnIsSynchronousFixture()) + self.useFixture(nova_fixtures.CGroupsFixture()) # When destroying an instance, os-vif will try to execute some commands # which hang tests so let's just stub out the unplug call to os-vif # since we don't care about it. diff --git a/nova/virt/libvirt/host.py b/nova/virt/libvirt/host.py index 46435a9a7fd..7d22e9c8419 100644 --- a/nova/virt/libvirt/host.py +++ b/nova/virt/libvirt/host.py @@ -1611,15 +1611,44 @@ def is_cpu_control_policy_capable(self): CONFIG_CGROUP_SCHED may be disabled in some kernel configs to improve scheduler latency. """ + return self._has_cgroupsv1_cpu_controller() or \ + self._has_cgroupsv2_cpu_controller() + + def _has_cgroupsv1_cpu_controller(self): + LOG.debug(f"Searching host: '{self.get_hostname()}' " + "for CPU controller through CGroups V1...") try: with open("/proc/self/mounts", "r") as fd: for line in fd.readlines(): # mount options and split options bits = line.split()[3].split(",") if "cpu" in bits: + LOG.debug("CPU controller found on host.") + return True + LOG.debug("CPU controller missing on host.") + return False + except IOError as ex: + LOG.debug(f"Search failed due to: '{ex}'. " + "Maybe the host is not running under CGroups V1. " + "Deemed host to be missing controller by this approach.") + return False + + def _has_cgroupsv2_cpu_controller(self): + LOG.debug(f"Searching host: '{self.get_hostname()}' " + "for CPU controller through CGroups V2...") + try: + with open("/sys/fs/cgroup/cgroup.controllers", "r") as fd: + for line in fd.readlines(): + bits = line.split() + if "cpu" in bits: + LOG.debug("CPU controller found on host.") return True + LOG.debug("CPU controller missing on host.") return False - except IOError: + except IOError as ex: + LOG.debug(f"Search failed due to: '{ex}'. " + "Maybe the host is not running under CGroups V2. 
" + "Deemed host to be missing controller by this approach.") return False def get_canonical_machine_type(self, arch, machine) -> str: From abd9a34a6014730620cee15a44f328e48e57398e Mon Sep 17 00:00:00 2001 From: Elod Illes Date: Thu, 11 May 2023 16:19:38 +0200 Subject: [PATCH 26/73] CI: fix backport validator for new branch naming validate-backport job started to fail as only old stable branch naming is accepted. This patch extends the script to allow numbers and dot as well in the branch names (like stable/2023.1). Change-Id: Icbdcd5d124717e195d55d9e42530611ed812fadd (cherry picked from commit fe125da63b6508788654f0dab721f13005c09d25) (cherry picked from commit 09f85a8a922e4ad68271886d2389042d4f4d6896) --- tools/check-cherry-picks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/check-cherry-picks.sh b/tools/check-cherry-picks.sh index 46cef8c2250..3042aa16593 100755 --- a/tools/check-cherry-picks.sh +++ b/tools/check-cherry-picks.sh @@ -23,7 +23,7 @@ hashes=$(git show --format='%b' --quiet $commit_hash | sed -nr 's/^.cherry picke checked=0 branches+="" for hash in $hashes; do - branch=$(git branch -a --contains "$hash" 2>/dev/null| grep -oE '(master|stable/[a-z]+)') + branch=$(git branch -a --contains "$hash" 2>/dev/null| grep -oE '(master|stable/[a-z0-9.]+)') if [ $? -ne 0 ]; then echo "Cherry pick hash $hash not on any master or stable branches" exit 1 From 91ee67019e0eb89290e599b2283f1f421718b796 Mon Sep 17 00:00:00 2001 From: Sylvain Bauza Date: Tue, 2 May 2023 15:51:28 +0000 Subject: [PATCH 27/73] Revert "Debug Nova APIs call failures" This reverts commit afb0f774841d30dcae9c074d524e7fa9be840678. Reason for revert: We unfortunately leak the token in the logs which is considered a security flaw, even if only provided on DEBUG level. 
Change-Id: I52b52e65b689dadbdb08122c94652c491f850de6 Closes-Bug: #2012993 (cherry picked from commit 6833695e70bba31b84a0a19301657bc59ae1710b) (cherry picked from commit a02f96687350ad74d9921406a525ee991bbe8882) --- nova/api/openstack/wsgi.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nova/api/openstack/wsgi.py b/nova/api/openstack/wsgi.py index 1d17ce1c9fd..e64b4a2016a 100644 --- a/nova/api/openstack/wsgi.py +++ b/nova/api/openstack/wsgi.py @@ -538,12 +538,6 @@ def _process_stack(self, request, action, action_args, with ResourceExceptionHandler(): action_result = self.dispatch(meth, request, action_args) except Fault as ex: - LOG.debug(f'Request method failure captured:\n' - f' request: {request}\n' - f' method: {meth}\n' - f' exception: {ex}\n' - f' action_args: {action_args}\n', - exc_info=1) response = ex if not response: From 77db64237b23050d94df113a38412c5333d23357 Mon Sep 17 00:00:00 2001 From: Sylvain Bauza Date: Wed, 3 May 2023 17:00:14 +0200 Subject: [PATCH 28/73] Fix get_segments_id with subnets without segment_id Unfortunately when we merged Ie166f3b51fddeaf916cda7c5ac34bbcdda0fd17a we forgot that subnets can have no segment_id field. 
Change-Id: Idb35b7e3c69fe8efe498abe4ebcc6cad8918c4ed Closes-Bug: #2018375 (cherry picked from commit 6d7bd6a03446d5227d515b2b4c0da632ef4aa4a1) (cherry picked from commit 6b8d9d419170fb0ec2c6df561a0874e6362382c1) --- nova/network/neutron.py | 2 +- nova/tests/unit/network/test_neutron.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/nova/network/neutron.py b/nova/network/neutron.py index affd76535fc..dd3e4de05db 100644 --- a/nova/network/neutron.py +++ b/nova/network/neutron.py @@ -3896,7 +3896,7 @@ def get_segment_ids_for_network( 'Failed to get segment IDs for network %s' % network_id) from e # The segment field of an unconfigured subnet could be None return [subnet['segment_id'] for subnet in subnets - if subnet['segment_id'] is not None] + if subnet.get('segment_id') is not None] def get_segment_id_for_subnet( self, diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py index fec66fb2d31..c551191e4cf 100644 --- a/nova/tests/unit/network/test_neutron.py +++ b/nova/tests/unit/network/test_neutron.py @@ -7431,7 +7431,7 @@ def test_get_segment_ids_for_network_passes(self, mock_client): network_id=uuids.network_id, fields='segment_id') @mock.patch.object(neutronapi, 'get_client') - def test_get_segment_ids_for_network_with_no_segments(self, mock_client): + def test_get_segment_ids_for_network_with_segments_none(self, mock_client): subnets = {'subnets': [{'segment_id': None}]} mocked_client = mock.create_autospec(client.Client) mock_client.return_value = mocked_client @@ -7446,6 +7446,22 @@ def test_get_segment_ids_for_network_with_no_segments(self, mock_client): mocked_client.list_subnets.assert_called_once_with( network_id=uuids.network_id, fields='segment_id') + @mock.patch.object(neutronapi, 'get_client') + def test_get_segment_ids_for_network_with_no_segments(self, mock_client): + subnets = {'subnets': [{}]} + mocked_client = mock.create_autospec(client.Client) + 
mock_client.return_value = mocked_client + mocked_client.list_subnets.return_value = subnets + with mock.patch.object( + self.api, 'has_segment_extension', return_value=True, + ): + res = self.api.get_segment_ids_for_network( + self.context, uuids.network_id) + self.assertEqual([], res) + mock_client.assert_called_once_with(self.context, admin=True) + mocked_client.list_subnets.assert_called_once_with( + network_id=uuids.network_id, fields='segment_id') + @mock.patch.object(neutronapi, 'get_client') def test_get_segment_ids_for_network_fails(self, mock_client): mocked_client = mock.create_autospec(client.Client) From 2f1d65774fbcf5c25c4ba53583b6a802a03f4c4d Mon Sep 17 00:00:00 2001 From: Yusuke Okada Date: Wed, 8 Feb 2023 22:10:31 -0500 Subject: [PATCH 29/73] Fix failed count for anti-affinity check The late anti-affinity check runs in the compute manager to avoid parallel scheduling requests to invalidate the anti-affinity server group policy. When the check fails the instance is re-scheduled. However this failure counted as a real instance boot failure of the compute host and can lead to de-prioritization of the compute host in the scheduler via BuildFailureWeigher. As the late anti-affinity check does not indicate any fault of the compute host itself it should not be counted towards the build failure counter. This patch adds new build results to handle this case. 
Closes-Bug: #1996732 Change-Id: I2ba035c09ace20e9835d9d12a5c5bee17d616718 Signed-off-by: Yusuke Okada (cherry picked from commit 56d320a203a13f262a2e94e491af222032e453d3) (cherry picked from commit 1b56714e9119ab4152e6f33985a499b2d83a491b) --- nova/compute/build_results.py | 8 ++ nova/compute/manager.py | 33 +++-- nova/exception.py | 9 ++ nova/tests/functional/test_server_group.py | 80 +++++++++++ nova/tests/unit/compute/test_compute_mgr.py | 149 +++++++++++++++++++- 5 files changed, 265 insertions(+), 14 deletions(-) diff --git a/nova/compute/build_results.py b/nova/compute/build_results.py index ca9ed51410f..a091c89ff65 100644 --- a/nova/compute/build_results.py +++ b/nova/compute/build_results.py @@ -24,3 +24,11 @@ ACTIVE = 'active' # Instance is running FAILED = 'failed' # Instance failed to build and was not rescheduled RESCHEDULED = 'rescheduled' # Instance failed to build, but was rescheduled +# Instance failed by policy violation (such as affinity or anti-affinity) +# and was not rescheduled. In this case, the node's failed count won't be +# increased. +FAILED_BY_POLICY = 'failed_by_policy' +# Instance failed by policy violation (such as affinity or anti-affinity) +# but was rescheduled. In this case, the node's failed count won't be +# increased. 
+RESCHEDULED_BY_POLICY = 'rescheduled_by_policy' diff --git a/nova/compute/manager.py b/nova/compute/manager.py index d29348097fb..d216812a351 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1803,11 +1803,8 @@ def _do_validation(context, instance, group): else: max_server = 1 if len(members_on_host) >= max_server: - msg = _("Anti-affinity instance group policy " - "was violated.") - raise exception.RescheduledException( - instance_uuid=instance.uuid, - reason=msg) + raise exception.GroupAffinityViolation( + instance_uuid=instance.uuid, policy='Anti-affinity') # NOTE(ganso): The check for affinity below does not work and it # can easily be violated because the lock happens in different @@ -1817,10 +1814,8 @@ def _do_validation(context, instance, group): elif group.policy and 'affinity' == group.policy: group_hosts = group.get_hosts(exclude=[instance.uuid]) if group_hosts and self.host not in group_hosts: - msg = _("Affinity instance group policy was violated.") - raise exception.RescheduledException( - instance_uuid=instance.uuid, - reason=msg) + raise exception.GroupAffinityViolation( + instance_uuid=instance.uuid, policy='Affinity') _do_validation(context, instance, group) @@ -2260,6 +2255,9 @@ def _locked_do_build_and_run_instance(*args, **kwargs): self.reportclient.delete_allocation_for_instance( context, instance.uuid, force=True) + if result in (build_results.FAILED_BY_POLICY, + build_results.RESCHEDULED_BY_POLICY): + return if result in (build_results.FAILED, build_results.RESCHEDULED): self._build_failed(node) @@ -2358,6 +2356,8 @@ def _do_build_and_run_instance(self, context, instance, image, self._nil_out_instance_obj_host_and_node(instance) self._set_instance_obj_error_state(instance, clean_task_state=True) + if isinstance(e, exception.RescheduledByPolicyException): + return build_results.FAILED_BY_POLICY return build_results.FAILED LOG.debug(e.format_message(), instance=instance) # This will be used for logging the exception @@ 
-2384,6 +2384,10 @@ def _do_build_and_run_instance(self, context, instance, image, injected_files, requested_networks, security_groups, block_device_mapping, request_spec=request_spec, host_lists=[host_list]) + + if isinstance(e, exception.RescheduledByPolicyException): + return build_results.RESCHEDULED_BY_POLICY + return build_results.RESCHEDULED except (exception.InstanceNotFound, exception.UnexpectedDeletingTaskStateError): @@ -2601,6 +2605,17 @@ def _build_and_run_instance(self, context, instance, image, injected_files, bdms=block_device_mapping) raise exception.BuildAbortException(instance_uuid=instance.uuid, reason=e.format_message()) + except exception.GroupAffinityViolation as e: + LOG.exception('Failed to build and run instance', + instance=instance) + self._notify_about_instance_usage(context, instance, + 'create.error', fault=e) + compute_utils.notify_about_instance_create( + context, instance, self.host, + phase=fields.NotificationPhase.ERROR, exception=e, + bdms=block_device_mapping) + raise exception.RescheduledByPolicyException( + instance_uuid=instance.uuid, reason=str(e)) except Exception as e: LOG.exception('Failed to build and run instance', instance=instance) diff --git a/nova/exception.py b/nova/exception.py index 3d8e596312a..b0503425d0a 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -1487,6 +1487,15 @@ class RescheduledException(NovaException): "%(reason)s") +class RescheduledByPolicyException(RescheduledException): + msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: " + "%(reason)s") + + +class GroupAffinityViolation(NovaException): + msg_fmt = _("%(policy)s instance group policy was violated") + + class InstanceFaultRollback(NovaException): def __init__(self, inner_exception=None): message = _("Instance rollback performed due to: %s") diff --git a/nova/tests/functional/test_server_group.py b/nova/tests/functional/test_server_group.py index 38804a671b5..d1d702bacee 100644 --- 
a/nova/tests/functional/test_server_group.py +++ b/nova/tests/functional/test_server_group.py @@ -20,6 +20,7 @@ from nova.compute import instance_actions from nova import context from nova.db.main import api as db +from nova import objects from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.functional.api import client @@ -495,6 +496,85 @@ def test_soft_affinity_not_supported(self): self.assertIn('Invalid input', ex.response.text) self.assertIn('soft-affinity', ex.response.text) + @mock.patch('nova.scheduler.filters.affinity_filter.' + 'ServerGroupAffinityFilter.host_passes', return_value=True) + def test_failed_count_with_affinity_violation(self, mock_host_passes): + """Check failed count not incremented after violation of the late + affinity check. https://bugs.launchpad.net/nova/+bug/1996732 + """ + + created_group = self.api.post_server_groups(self.affinity) + flavor = self.api.get_flavors()[2] + + # Ensure the first instance is on compute1 + with utils.temporary_mutation(self.admin_api, microversion='2.53'): + compute2_service_id = self.admin_api.get_services( + host=self.compute2.host, binary='nova-compute')[0]['id'] + self.admin_api.put_service(compute2_service_id, + {'status': 'disabled'}) + + self._boot_a_server_to_group(created_group, flavor=flavor) + + # Ensure the second instance is on compute2 + with utils.temporary_mutation(self.admin_api, microversion='2.53'): + self.admin_api.put_service(compute2_service_id, + {'status': 'enabled'}) + compute1_service_id = self.admin_api.get_services( + host=self.compute.host, binary='nova-compute')[0]['id'] + self.admin_api.put_service(compute1_service_id, + {'status': 'disabled'}) + + # Expects GroupAffinityViolation exception + failed_server = self._boot_a_server_to_group(created_group, + flavor=flavor, + expected_status='ERROR') + + self.assertEqual('Exceeded maximum number of retries. Exhausted all ' + 'hosts available for retrying build failures for ' + 'instance %s.' 
% failed_server['id'], + failed_server['fault']['message']) + + ctxt = context.get_admin_context() + computes = objects.ComputeNodeList.get_all(ctxt) + + for node in computes: + self.assertEqual(node.stats.get('failed_builds'), '0') + + @mock.patch('nova.scheduler.filters.affinity_filter.' + 'ServerGroupAntiAffinityFilter.host_passes', return_value=True) + def test_failed_count_with_anti_affinity_violation(self, mock_host_passes): + """Check failed count after violation of the late affinity check. + https://bugs.launchpad.net/nova/+bug/1996732 + """ + + created_group = self.api.post_server_groups(self.anti_affinity) + flavor = self.api.get_flavors()[2] + + # Ensure two instances are scheduled on the same host + with utils.temporary_mutation(self.admin_api, microversion='2.53'): + compute2_service_id = self.admin_api.get_services( + host=self.compute2.host, binary='nova-compute')[0]['id'] + self.admin_api.put_service(compute2_service_id, + {'status': 'disabled'}) + + self._boot_a_server_to_group(created_group, flavor=flavor) + + # Expects GroupAffinityViolation exception + failed_server = self._boot_a_server_to_group(created_group, + flavor=flavor, + expected_status='ERROR') + + self.assertEqual('Exceeded maximum number of retries. Exhausted all ' + 'hosts available for retrying build failures for ' + 'instance %s.' 
% failed_server['id'], + failed_server['fault']['message']) + + ctxt = context.get_admin_context() + computes = objects.ComputeNodeList.get_all(ctxt) + + for node in computes: + self.assertEqual(node.stats.get('failed_builds'), '0') + class ServerGroupAffinityConfTest(ServerGroupTestBase): api_major_version = 'v2.1' diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index e521283acc8..4e9bd4d7200 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -6753,13 +6753,14 @@ def test_build_and_run_instance_with_unlimited_max_concurrent_builds(self): self.compute = manager.ComputeManager() self._test_build_and_run_instance() + @mock.patch.object(manager.ComputeManager, '_build_succeeded') @mock.patch.object(objects.InstanceActionEvent, 'event_finish_with_failure') @mock.patch.object(objects.InstanceActionEvent, 'event_start') @mock.patch.object(objects.Instance, 'save') @mock.patch.object(manager.ComputeManager, '_build_and_run_instance') def _test_build_and_run_instance(self, mock_build, mock_save, - mock_start, mock_finish): + mock_start, mock_finish, mock_succeeded): self._do_build_instance_update(mock_save) orig_do_build_and_run = self.compute._do_build_and_run_instance @@ -6792,6 +6793,7 @@ def _wrapped_do_build_and_run_instance(*args, **kwargs): self.requested_networks, self.security_groups, self.block_device_mapping, self.node, self.limits, self.filter_properties, {}, self.accel_uuids) + mock_succeeded.assert_called_once_with(self.node) # This test when sending an icehouse compatible rpc call to juno compute # node, NetworkRequest object can load from three items tuple. 
@@ -6819,6 +6821,7 @@ def test_build_and_run_instance_with_icehouse_requested_network( self.assertEqual('10.0.0.1', str(requested_network.address)) self.assertEqual(uuids.port_instance, requested_network.port_id) + @mock.patch.object(manager.ComputeManager, '_build_failed') @mock.patch.object(objects.InstanceActionEvent, 'event_finish_with_failure') @mock.patch.object(objects.InstanceActionEvent, 'event_start') @@ -6834,7 +6837,7 @@ def test_build_and_run_instance_with_icehouse_requested_network( def test_build_abort_exception(self, mock_build_run, mock_build, mock_set, mock_nil, mock_add, mock_clean_vol, mock_clean_net, mock_save, - mock_start, mock_finish): + mock_start, mock_finish, mock_failed): self._do_build_instance_update(mock_save) mock_build_run.side_effect = exception.BuildAbortException(reason='', instance_uuid=self.instance.uuid) @@ -6877,7 +6880,9 @@ def _wrapped_do_build_and_run_instance(*args, **kwargs): mock.ANY, mock.ANY) mock_nil.assert_called_once_with(self.instance) mock_set.assert_called_once_with(self.instance, clean_task_state=True) + mock_failed.assert_called_once_with(self.node) + @mock.patch.object(manager.ComputeManager, '_build_failed') @mock.patch.object(objects.InstanceActionEvent, 'event_finish_with_failure') @mock.patch.object(objects.InstanceActionEvent, 'event_start') @@ -6888,8 +6893,8 @@ def _wrapped_do_build_and_run_instance(*args, **kwargs): @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances') @mock.patch.object(manager.ComputeManager, '_build_and_run_instance') def test_rescheduled_exception(self, mock_build_run, - mock_build, mock_set, mock_nil, - mock_save, mock_start, mock_finish): + mock_build, mock_set, mock_nil, mock_save, + mock_start, mock_finish, mock_failed): self._do_build_instance_update(mock_save, reschedule_update=True) mock_build_run.side_effect = exception.RescheduledException(reason='', instance_uuid=self.instance.uuid) @@ -6936,6 +6941,7 @@ def _wrapped_do_build_and_run_instance(*args, 
**kwargs): self.admin_pass, self.injected_files, self.requested_networks, self.security_groups, self.block_device_mapping, request_spec={}, host_lists=[fake_host_list]) + mock_failed.assert_called_once_with(self.node) @mock.patch.object(manager.ComputeManager, '_shutdown_instance') @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance') @@ -7289,6 +7295,139 @@ def _wrapped_do_build_and_run_instance(*args, **kwargs): self.security_groups, self.block_device_mapping, request_spec={}, host_lists=[fake_host_list]) + @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim', + new=mock.MagicMock()) + @mock.patch.object(objects.InstanceActionEvent, + 'event_finish_with_failure') + @mock.patch.object(objects.InstanceActionEvent, 'event_start') + @mock.patch.object(objects.Instance, 'save') + @mock.patch.object(manager.ComputeManager, + '_nil_out_instance_obj_host_and_node') + @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances') + @mock.patch.object(manager.ComputeManager, '_build_failed') + @mock.patch.object(manager.ComputeManager, '_build_succeeded') + @mock.patch.object(manager.ComputeManager, + '_validate_instance_group_policy') + def test_group_affinity_violation_exception_with_retry( + self, mock_validate_policy, mock_succeeded, mock_failed, mock_build, + mock_nil, mock_save, mock_start, mock_finish, + ): + """Test retry by affinity or anti-affinity validation check doesn't + increase failed build + """ + + self._do_build_instance_update(mock_save, reschedule_update=True) + mock_validate_policy.side_effect = \ + exception.GroupAffinityViolation( + instance_uuid=self.instance.uuid, policy="Affinity") + + orig_do_build_and_run = self.compute._do_build_and_run_instance + + def _wrapped_do_build_and_run_instance(*args, **kwargs): + ret = orig_do_build_and_run(*args, **kwargs) + self.assertEqual(build_results.RESCHEDULED_BY_POLICY, ret) + return ret + + with test.nested( + mock.patch.object( + self.compute, 
'_do_build_and_run_instance', + side_effect=_wrapped_do_build_and_run_instance, + ), + mock.patch.object( + self.compute.network_api, 'get_instance_nw_info', + ), + ): + self.compute.build_and_run_instance( + self.context, self.instance, + self.image, request_spec={}, + filter_properties=self.filter_properties, + accel_uuids=self.accel_uuids, + injected_files=self.injected_files, + admin_password=self.admin_pass, + requested_networks=self.requested_networks, + security_groups=self.security_groups, + block_device_mapping=self.block_device_mapping, node=self.node, + limits=self.limits, host_list=fake_host_list) + + mock_succeeded.assert_not_called() + mock_failed.assert_not_called() + + self._instance_action_events(mock_start, mock_finish) + self._assert_build_instance_update(mock_save, reschedule_update=True) + mock_nil.assert_called_once_with(self.instance) + mock_build.assert_called_once_with(self.context, + [self.instance], self.image, self.filter_properties, + self.admin_pass, self.injected_files, self.requested_networks, + self.security_groups, self.block_device_mapping, + request_spec={}, host_lists=[fake_host_list]) + + @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim', + new=mock.MagicMock()) + @mock.patch.object(objects.InstanceActionEvent, + 'event_finish_with_failure') + @mock.patch.object(objects.InstanceActionEvent, 'event_start') + @mock.patch.object(objects.Instance, 'save') + @mock.patch.object(manager.ComputeManager, + '_nil_out_instance_obj_host_and_node') + @mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks') + @mock.patch.object(manager.ComputeManager, '_set_instance_obj_error_state') + @mock.patch.object(compute_utils, 'add_instance_fault_from_exc') + @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances') + @mock.patch.object(manager.ComputeManager, '_build_failed') + @mock.patch.object(manager.ComputeManager, '_build_succeeded') + @mock.patch.object(manager.ComputeManager, + 
'_validate_instance_group_policy') + def test_group_affinity_violation_exception_without_retry( + self, mock_validate_policy, mock_succeeded, mock_failed, mock_build, + mock_add, mock_set_state, mock_clean_net, mock_nil, mock_save, + mock_start, mock_finish, + ): + """Test failure by affinity or anti-affinity validation check doesn't + increase failed build + """ + + self._do_build_instance_update(mock_save) + mock_validate_policy.side_effect = \ + exception.GroupAffinityViolation( + instance_uuid=self.instance.uuid, policy="Affinity") + + orig_do_build_and_run = self.compute._do_build_and_run_instance + + def _wrapped_do_build_and_run_instance(*args, **kwargs): + ret = orig_do_build_and_run(*args, **kwargs) + self.assertEqual(build_results.FAILED_BY_POLICY, ret) + return ret + + with mock.patch.object( + self.compute, '_do_build_and_run_instance', + side_effect=_wrapped_do_build_and_run_instance, + ): + self.compute.build_and_run_instance( + self.context, self.instance, + self.image, request_spec={}, + filter_properties={}, + accel_uuids=[], + injected_files=self.injected_files, + admin_password=self.admin_pass, + requested_networks=self.requested_networks, + security_groups=self.security_groups, + block_device_mapping=self.block_device_mapping, node=self.node, + limits=self.limits, host_list=fake_host_list) + + mock_succeeded.assert_not_called() + mock_failed.assert_not_called() + + self._instance_action_events(mock_start, mock_finish) + self._assert_build_instance_update(mock_save) + mock_clean_net.assert_called_once_with(self.context, self.instance, + self.requested_networks) + mock_add.assert_called_once_with(self.context, self.instance, + mock.ANY, mock.ANY, fault_message=mock.ANY) + mock_nil.assert_called_once_with(self.instance) + mock_build.assert_not_called() + mock_set_state.assert_called_once_with(self.instance, + clean_task_state=True) + @mock.patch.object(objects.InstanceActionEvent, 'event_finish_with_failure') 
@mock.patch.object(objects.InstanceActionEvent, 'event_start') @@ -7868,7 +8007,7 @@ def test_validate_instance_group_policy_with_rules( nodes.return_value = ['nodename'] migration_list.return_value = [objects.Migration( uuid=uuids.migration, instance_uuid=uuids.instance)] - self.assertRaises(exception.RescheduledException, + self.assertRaises(exception.GroupAffinityViolation, self.compute._validate_instance_group_policy, self.context, instance, hints) From 710116f4beadb1553c9ad7991ea69b0e286657d6 Mon Sep 17 00:00:00 2001 From: melanie witt Date: Fri, 27 Jan 2023 00:23:06 +0000 Subject: [PATCH 30/73] Reproducer for bug 2003991 unshelving offloaded instance This adds test coverage for: * Shelve/unshelve offloaded with legacy quota usage * Shelve/unshelve offloaded with quota usage from placement * Shelve/unshelve offloaded with unified limits * Shelve/unshelve with legacy quota usage * Shelve/unshelve with quota usage from placement * Shelve/unshelve with unified limits Related-Bug: #2003991 Change-Id: Icc9b6366aebba2f8468e2127da7b7e099098513a (cherry picked from commit 427b2cb4d61cdfaf18b2467eb50b3772dffd3def) (cherry picked from commit 004a773a3a286f39889519bb5b2009fb9bf44fb1) --- nova/tests/functional/test_servers.py | 107 ++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index d1ab84aa7b5..18da5da8b02 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -23,6 +23,7 @@ from cinderclient import exceptions as cinder_exception from keystoneauth1 import adapter from oslo_config import cfg +from oslo_limit import fixture as limit_fixture from oslo_log import log as logging from oslo_serialization import base64 from oslo_serialization import jsonutils @@ -377,6 +378,112 @@ def test_deferred_delete_force(self): # Wait for real deletion self._wait_until_deleted(found_server) + def test_unshelve_offloaded_overquota(self): + # 
Use a quota limit of 3 vcpus. + self.flags(cores=3, group='quota') + + # Use flavor that has vcpus = 1. + for i in range(0, 3): + server = self._create_server(flavor_id=1) + + # We should be at the quota limit now. Shelve an instance and wait for + # it to become SHELVED_OFFLOADED. + self._shelve_server(server, expected_state='SHELVED_OFFLOADED') + + # Try to boot another instance. It should fail because shelved + # offloaded instances still consume quota. + ex = self.assertRaises(client.OpenStackApiException, + self._create_server, + flavor_id=1) + self.assertEqual(403, ex.response.status_code) + + # Unshelving the instance should also succeed. + self._unshelve_server(server) + + def _test_unshelve_offloaded_overquota_placement(self): + # Use flavor that has vcpus = 1. + for i in range(0, 3): + server = self._create_server(flavor_id=1) + + # We should be at the quota limit now. Shelve an instance and wait for + # it to become SHELVED_OFFLOADED. + self._shelve_server(server, expected_state='SHELVED_OFFLOADED') + + # Try to boot another instance. It should succeed because with + # placement, shelved offloaded instances do not consume cores/ram + # quota. + self._create_server(flavor_id=1) + + # FIXME(melwitt): This is bug #2003991, the unshelve is supposed to + # fail if we would be over quota after unshelving. + # Now try to unshelve the earlier instance. It should fail because it + # would put us over quota to have 4 running instances. + # ex = self.assertRaises(client.OpenStackApiException, + # self._unshelve_server, + # server) + # self.assertEqual(403, ex.response.status_code) + self._unshelve_server(server) + + def test_unshelve_offloaded_overquota_placement(self): + # Count quota usage from placement. + self.flags(count_usage_from_placement=True, group='quota') + # Use a quota limit of 3 vcpus. 
+ self.flags(cores=3, group='quota') + self._test_unshelve_offloaded_overquota_placement() + + def test_unshelve_offloaded_overquota_ul(self): + self.flags(driver='nova.quota.UnifiedLimitsDriver', group='quota') + limits = { + 'servers': 5, + 'class:VCPU': 3, + 'class:MEMORY_MB': 2048, + 'class:DISK_GB': 5 + } + self.useFixture(limit_fixture.LimitFixture(limits, {})) + self._test_unshelve_offloaded_overquota_placement() + + def test_unshelve_overquota(self): + # Test for behavior where the shelved instance is not offloaded. + self.flags(shelved_offload_time=3600) + # Use a quota limit of 3 vcpus. + self.flags(cores=3, group='quota') + + # Use flavor that has vcpus = 1. + for i in range(0, 3): + server = self._create_server(flavor_id=1) + + # We should be at the quota limit now. Shelve an instance. + self._shelve_server(server, expected_state='SHELVED') + + # Try to boot another instance. It should fail because shelved + # instances still consume quota. + ex = self.assertRaises(client.OpenStackApiException, + self._create_server, + flavor_id=1) + self.assertEqual(403, ex.response.status_code) + + # Verify that it's still SHELVED. + self._wait_for_state_change(server, 'SHELVED') + + # Unshelving the instance should also succeed. + self._unshelve_server(server) + + def test_unshelve_overquota_placement(self): + # Count quota usage from placement, should behave the same as legacy. + self.flags(count_usage_from_placement=True, group='quota') + self.test_unshelve_overquota() + + def test_unshelve_overquota_ul(self): + self.flags(driver='nova.quota.UnifiedLimitsDriver', group='quota') + limits = { + 'servers': 5, + 'class:VCPU': 3, + 'class:MEMORY_MB': 2048, + 'class:DISK_GB': 5 + } + self.useFixture(limit_fixture.LimitFixture(limits, {})) + self.test_unshelve_overquota_placement() + def test_create_server_with_metadata(self): # Creates a server with metadata. 
From 9e8456297681ad21680acd35718e3cb97f8458f2 Mon Sep 17 00:00:00 2001 From: melanie witt Date: Wed, 17 May 2023 03:04:49 +0000 Subject: [PATCH 31/73] Add debug logging when Instance raises OrphanedObjectError This logging would be helpful in debugging issues when OrphanedObjectError is raised by an instance. Currently, there is not a way to identify which instance is attempting to lazy-load a field while orphaned. Being able to locate the instance in the database could also help with recovery/cleanup when a problematic record is disrupting operation of a deployment. Change-Id: I093de2839c1bb7c949a0812e07b63de4cc5ed167 (cherry picked from commit e0fbb6fc06d3b08b938af2e36b11f04c57fe6954) (cherry picked from commit f32deaa617286e4b0dc2d01585ccb5ac821a571c) --- nova/objects/instance.py | 5 +++++ nova/tests/unit/objects/test_instance.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/nova/objects/instance.py b/nova/objects/instance.py index fed1a7c58b2..9107c683a7f 100644 --- a/nova/objects/instance.py +++ b/nova/objects/instance.py @@ -1090,6 +1090,11 @@ def clear_numa_topology(self): def obj_load_attr(self, attrname): # NOTE(danms): We can't lazy-load anything without a context and a uuid if not self._context: + if 'uuid' in self: + LOG.debug( + "Lazy-load of '%s' attempted by orphaned instance", + attrname, instance=self + ) raise exception.OrphanedObjectError(method='obj_load_attr', objtype=self.obj_name()) if 'uuid' not in self: diff --git a/nova/tests/unit/objects/test_instance.py b/nova/tests/unit/objects/test_instance.py index 6215d2be60b..3ff83493537 100644 --- a/nova/tests/unit/objects/test_instance.py +++ b/nova/tests/unit/objects/test_instance.py @@ -1633,6 +1633,21 @@ def test_save_objectfield_reraises_if_not_instance_related(self): self._test_save_objectfield_fk_constraint_fails( 'other_foreign_key', db_exc.DBReferenceError) + @mock.patch('nova.objects.instance.LOG.debug') + def test_obj_load_attr_log(self, mock_log_debug): + # 
Instance with no UUID should not log. + instance = objects.Instance() + self.assertRaises( + exception.OrphanedObjectError, instance.obj_load_attr, 'foo') + mock_log_debug.assert_not_called() + # Instance with UUID should log. + instance = objects.Instance( + uuid='127a0d59-b88c-422b-b9a1-2dc7cc51fb9a') + self.assertRaises( + exception.OrphanedObjectError, instance.obj_load_attr, 'foo') + msg = "Lazy-load of '%s' attempted by orphaned instance" + mock_log_debug.assert_called_once_with(msg, 'foo', instance=instance) + class TestRemoteInstanceObject(test_objects._RemoteTest, _TestInstanceObject): From e5eb65e7a0a481a30332ea06e87d3c274dc1b046 Mon Sep 17 00:00:00 2001 From: Sean Mooney Date: Tue, 4 Jul 2023 16:42:08 +0100 Subject: [PATCH 32/73] enable validations in nova-lvm As of I8ca059a4702471d4d30ea5a06079859eba3f5a81 validations are now required for test_rebuild_volume_backed_server. Validations are also required for any volume attach/detach based test in general due to known qemu issues. This patch just turns them back on to unblock the gate. Depends-On: https://review.opendev.org/c/openstack/devstack-plugin-ceph/+/888165 Depends-On: https://review.opendev.org/c/openstack/devstack/+/888228 Closes-Bug: #2025813 Change-Id: Ia198f712e2ad277743aed08e27e480208f463ac7 (cherry picked from commit 6f56c5c9fd60ee1d53376a9100a9580cb2b38dc3) (cherry picked from commit 976364f9e8f2ddb0e2cb5d8dc765c37ef833c837) --- .zuul.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 0d489a5a82b..7656bacd865 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -262,8 +262,6 @@ NOVA_BACKEND: LVM # Do not waste time clearing volumes. LVM_VOLUME_CLEAR: none - # Disable SSH validation in tests to save time. - TEMPEST_RUN_VALIDATION: false # Increase the size of the swift loopback device to accommodate RAW # snapshots from the LV based instance disks. # See bug #1913451 for more details.
From 53e3afe3a6024be6ead35132230dc468dd808850 Mon Sep 17 00:00:00 2001 From: melanie witt Date: Fri, 14 Jul 2023 02:23:23 +0000 Subject: [PATCH 33/73] Decorate only Flavor.get_* methods that execute queries The get_* methods on the Flavor object use a common helper method to build a query object to execute later. Currently, the @api_db_api.context_manager.reader decorator which manages the session is located on the helper method instead of on the methods that actually execute the database queries. Part of the context manager's job is to close the session after the query is executed. Because the decorator is not on the methods that actually execute the queries, those database connections are not being closed and it will eventually lead to errors like: sqlalchemy.exc.TimeoutError: QueuePool limit of size 5 overflow 50 reached, connection timed out, timeout 30.00 (Background on this error at: https://sqlalche.me/e/14/3o7r) which means the connection pool size plus the overflow size has been reached and the pool will block for a fixed period of time before timing out and raising this error. This removes the @api_db_api.context_manager.reader decorator from the query build helper method and adds it to the Flavor.get_* methods that execute the database queries. 
Closes-Bug: #2027755 Change-Id: I4bf83d1642b62ab103716aff6dae7438646e2b31 (cherry picked from commit 9ae6240c92e212b2fa96d5163f68ef1b30ee03b7) (cherry picked from commit 4f4f27be27cb202be6a8fb949ad21021a5cd735d) --- nova/objects/flavor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nova/objects/flavor.py b/nova/objects/flavor.py index 01eeb62331e..6d10b98559d 100644 --- a/nova/objects/flavor.py +++ b/nova/objects/flavor.py @@ -270,8 +270,9 @@ def _from_db_object(context, flavor, db_flavor, expected_attrs=None): return flavor @staticmethod - @api_db_api.context_manager.reader def _flavor_get_query_from_db(context): + # We don't use a database context decorator on this method because this + # method is not executing a query, it's only building one. query = context.session.query(api_models.Flavors).options( orm.joinedload(api_models.Flavors.extra_specs) ) @@ -285,6 +286,7 @@ def _flavor_get_query_from_db(context): @staticmethod @db_utils.require_context + @api_db_api.context_manager.reader def _flavor_get_from_db(context, id): """Returns a dict describing specific flavor.""" result = Flavor._flavor_get_query_from_db(context).\ @@ -296,6 +298,7 @@ def _flavor_get_from_db(context, id): @staticmethod @db_utils.require_context + @api_db_api.context_manager.reader def _flavor_get_by_name_from_db(context, name): """Returns a dict describing specific flavor.""" result = Flavor._flavor_get_query_from_db(context).\ @@ -307,6 +310,7 @@ def _flavor_get_by_name_from_db(context, name): @staticmethod @db_utils.require_context + @api_db_api.context_manager.reader def _flavor_get_by_flavor_id_from_db(context, flavor_id): """Returns a dict describing specific flavor_id.""" result = Flavor._flavor_get_query_from_db(context).\ From e13c86b4f320a0a040b558f2e207a911fc9f6127 Mon Sep 17 00:00:00 2001 From: Sylvain Bauza Date: Wed, 25 Oct 2023 10:51:21 +0200 Subject: [PATCH 34/73] add a regression test for all compute RPCAPI 6.x pinnings for rebuild We forgot that 
we automatically pin our RPC calls to the RPC version that the older compute supports, so when rolling-upgrading computes, we continue to use either Yoga or Zed versions for example when upgrading to 2023.1. Since the new parameters aren't optional, we broke the rebuild_instance() method then for Yoga to Zed and Zed to 2023.1. NOTE(elod.illes): test_rebuild_instance_6_1 test needed an update, as now we are on Zed branch, so zed to zed upgrade does not raise any Error, as we have the same parameters in the RPC call. Change-Id: Icf340f3d4c5ce0a4b7388003f168e7c479e58eee Related-Bug: #2040264 (cherry picked from commit 21fd0c430c714d21c52e0a0c996351c374a3e3d6) (cherry picked from commit eb310f3bd2f21efe0dd2bc6b133694a687e8f5ff) (cherry picked from commit a861b575081b31090ff9f89120b2247a7586acf8) --- .../regressions/test_bug_2040264.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 nova/tests/functional/regressions/test_bug_2040264.py diff --git a/nova/tests/functional/regressions/test_bug_2040264.py b/nova/tests/functional/regressions/test_bug_2040264.py new file mode 100644 index 00000000000..c6b911c7c87 --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_2040264.py @@ -0,0 +1,63 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional.api import client +from nova.tests.functional import integrated_helpers + + +class ComputeVersion6xPinnedRpcTests(integrated_helpers._IntegratedTestBase): + + compute_driver = 'fake.MediumFakeDriver' + ADMIN_API = True + api_major_version = 'v2.1' + microversion = 'latest' + + def setUp(self): + super(ComputeVersion6xPinnedRpcTests, self).setUp() + self.useFixture(nova_fixtures.CastAsCallFixture(self)) + + self.compute1 = self._start_compute(host='host1') + + def _test_rebuild_instance_with_compute_rpc_pin(self, version_cap): + # Since passing the latest microversion (>= 2.93) passes + # the 'reimage_boot_volume' parameter as True and it is + # not acceptable with compute RPC version (required 6.1) + # These tests fail, so assigning microversion to 2.92 + self.api.microversion = '2.92' + self.flags(compute=version_cap, group='upgrade_levels') + + server_req = self._build_server(networks='none') + server = self.api.post_server({'server': server_req}) + server = self._wait_for_state_change(server, 'ACTIVE') + + self.api.post_server_action(server['id'], {'rebuild': { + 'imageRef': '155d900f-4e14-4e4c-a73d-069cbf4541e6' + }}) + + # We automatically pin to 6.0 if old computes are Yoga or older. + def test_rebuild_instance_6_0(self): + e = self.assertRaises(client.OpenStackApiException, + self._test_rebuild_instance_with_compute_rpc_pin, '6.0') + self.assertEqual(500, e.response.status_code) + # NOTE(sbauza): This returns a TypeError because of + # 'reimage_boot_volume' and 'target_state' parameters missing from the + # rcpapi caller. + self.assertIn('TypeError', e.response.text) + + # We automatically pin to 6.1 if old computes are Zed. + def test_rebuild_instance_6_1(self): + self._test_rebuild_instance_with_compute_rpc_pin('6.1') + + # We automatically pin to 6.2 if old computes are 2023.1. 
+ def test_rebuild_instance_6_2(self): + self._test_rebuild_instance_with_compute_rpc_pin('6.2') From 1b9c4c7e64425196b5776154a0618c9e2a763be8 Mon Sep 17 00:00:00 2001 From: Sylvain Bauza Date: Wed, 25 Oct 2023 10:58:36 +0200 Subject: [PATCH 35/73] Fix rebuild compute RPC API exception for rolling-upgrades By I0d889691de1af6875603a9f0f174590229e7be18 we broke rebuild for Yoga or older computes. By I9660d42937ad62d647afc6be965f166cc5631392 we broke rebuild for Zed computes. Fixing this by making the parameters optional. Conflicts: nova/compute/manager.py NOTE(elod.illes): conflict is due to feature 'allowing target state for evacuate' I9660d42937ad62d647afc6be965f166cc5631392 was added in 2023.1 Antelope cycle. Change-Id: I0ca04045f8ac742e2b50490cbe5efccaee45c5c0 Closes-Bug: #2040264 (cherry picked from commit ee9ed0f7c6abf7c4847e6dc31f6d3d79b25b9d99) (cherry picked from commit 6b870ab90afe400ec82715e908afecbb00f0ed65) (cherry picked from commit edfb3975807b3eda4fae0ea07a3d99871ca87cae) --- nova/compute/manager.py | 4 ++-- nova/tests/functional/regressions/test_bug_2040264.py | 9 +-------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index d216812a351..bc31d41f273 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -3637,7 +3637,7 @@ def rebuild_instance(self, context, instance, orig_image_ref, image_ref, bdms, recreate, on_shared_storage, preserve_ephemeral, migration, scheduled_node, limits, request_spec, accel_uuids, - reimage_boot_volume): + reimage_boot_volume=None): """Destroy and re-make this instance. 
A 'rebuild' effectively purges all existing data from the system and @@ -3671,7 +3671,7 @@ def rebuild_instance(self, context, instance, orig_image_ref, image_ref, :param accel_uuids: a list of cyborg ARQ uuids :param reimage_boot_volume: Boolean to specify whether the user has explicitly requested to rebuild a boot - volume + volume or None if RPC version is <=6.0 """ # recreate=True means the instance is being evacuated from a failed diff --git a/nova/tests/functional/regressions/test_bug_2040264.py b/nova/tests/functional/regressions/test_bug_2040264.py index c6b911c7c87..c0e88c1154b 100644 --- a/nova/tests/functional/regressions/test_bug_2040264.py +++ b/nova/tests/functional/regressions/test_bug_2040264.py @@ -11,7 +11,6 @@ # under the License. from nova.tests import fixtures as nova_fixtures -from nova.tests.functional.api import client from nova.tests.functional import integrated_helpers @@ -46,13 +45,7 @@ def _test_rebuild_instance_with_compute_rpc_pin(self, version_cap): # We automatically pin to 6.0 if old computes are Yoga or older. def test_rebuild_instance_6_0(self): - e = self.assertRaises(client.OpenStackApiException, - self._test_rebuild_instance_with_compute_rpc_pin, '6.0') - self.assertEqual(500, e.response.status_code) - # NOTE(sbauza): This returns a TypeError because of - # 'reimage_boot_volume' and 'target_state' parameters missing from the - # rcpapi caller. - self.assertIn('TypeError', e.response.text) + self._test_rebuild_instance_with_compute_rpc_pin('6.0') # We automatically pin to 6.1 if old computes are Zed. def test_rebuild_instance_6_1(self): From c36e0db95749395d5915b366fe6d36f516151c1a Mon Sep 17 00:00:00 2001 From: Alexey Stupnikov Date: Thu, 25 May 2023 21:23:32 +0200 Subject: [PATCH 36/73] Translate VF network capabilities to port binding Libvirt's node device driver accumulates and reports information about host devices. 
Network capabilities reported by node device driver for NIC contain information about HW offloads supported by this NIC. One of possible features reported by node device driver is switchdev: a NIC capability to implement VFs similar to actual HW switch ports (also referred to as SR-IOV OVS hardware offload). From Neutron perspective, vnic-type should be set to "direct" and "switchdev" capability should be added to port binding profile to enable HW offload (there are also configuration steps on compute hosts to tune NIC config). This patch was written to automatically translate "switchdev" from VF network capabilities reported by node device driver to Neutron port binding profile and allow user to skip manual step that requires admin privileges. Other capabilities are also translated: they are not used right now, but provide visibility and can be utilized later. Closes-bug: #2020813 Closes-bug: #2008238 Change-Id: I3b17f386325b8f42c0c374f766fb21c520161a59 (cherry picked from commit cef3b5ef2cc1fe983578e4966208cf95fdea5880) (cherry picked from commit 7e4f45df91f33fa8b75feec95e5636db06fda443) (cherry picked from commit 4fcc8c369f2c580f86dbfc6b1f812516f80262c0) --- nova/network/neutron.py | 7 +++++++ nova/objects/pci_device.py | 7 +++++++ nova/tests/fixtures/libvirt_data.py | 1 + nova/tests/unit/network/test_neutron.py | 7 +++++-- nova/tests/unit/objects/test_pci_device.py | 10 ++++++++++ nova/tests/unit/virt/libvirt/test_host.py | 2 +- ...ilities_to_port_binding-48abbfe0ce2923cf.yaml | 16 ++++++++++++++++ 7 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/translate_vf_network_capabilities_to_port_binding-48abbfe0ce2923cf.yaml diff --git a/nova/network/neutron.py b/nova/network/neutron.py index dd3e4de05db..c785e058475 100644 --- a/nova/network/neutron.py +++ b/nova/network/neutron.py @@ -1588,6 +1588,13 @@ def _get_vf_pci_device_profile(self, pci_dev): 'pf_mac_address': pf_mac, 'vf_num': vf_num, }) + + # Update port binding 
capabilities using PCI device's network + # capabilities if they exist. + pci_net_caps = pci_dev.network_caps + if pci_net_caps: + vf_profile.update({'capabilities': pci_net_caps}) + return vf_profile def _get_pci_device_profile(self, pci_dev): diff --git a/nova/objects/pci_device.py b/nova/objects/pci_device.py index 554d68feca2..71fe4b2fdba 100644 --- a/nova/objects/pci_device.py +++ b/nova/objects/pci_device.py @@ -588,6 +588,13 @@ def mac_address(self): """ return self.extra_info.get('mac_address') + @property + def network_caps(self): + """PCI device network capabilities or empty list if not available""" + caps_json = self.extra_info.get('capabilities', '{}') + caps = jsonutils.loads(caps_json) + return caps.get('network', []) + @base.NovaObjectRegistry.register class PciDeviceList(base.ObjectListBase, base.NovaObject): diff --git a/nova/tests/fixtures/libvirt_data.py b/nova/tests/fixtures/libvirt_data.py index f022860f615..f921a5e2df3 100644 --- a/nova/tests/fixtures/libvirt_data.py +++ b/nova/tests/fixtures/libvirt_data.py @@ -2182,6 +2182,7 @@ def fake_kvm_guest(): + """, # noqa:E501 diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py index c551191e4cf..0789022cfae 100644 --- a/nova/tests/unit/network/test_neutron.py +++ b/nova/tests/unit/network/test_neutron.py @@ -8144,17 +8144,20 @@ def test__get_vf_pci_device_profile(self): 'pf_mac_address': '52:54:00:1e:59:c6', 'vf_num': 1, }, + 'network_caps': ['gso', 'sg', 'tso', 'tx'], 'dev_type': obj_fields.PciDeviceType.SRIOV_VF, } PciDevice = collections.namedtuple('PciDevice', ['vendor_id', 'product_id', 'address', 'card_serial_number', 'sriov_cap', - 'dev_type', 'parent_addr']) + 'dev_type', 'parent_addr', + 'network_caps']) mydev = PciDevice(**pci_dev) self.assertEqual(self.api._get_vf_pci_device_profile(mydev), {'pf_mac_address': '52:54:00:1e:59:c6', 'vf_num': 1, - 'card_serial_number': 'MT2113X00000'}) + 'card_serial_number': 'MT2113X00000', + 'capabilities': 
['gso', 'sg', 'tso', 'tx']}) @mock.patch.object( neutronapi.API, '_get_vf_pci_device_profile', diff --git a/nova/tests/unit/objects/test_pci_device.py b/nova/tests/unit/objects/test_pci_device.py index 1e971c5a214..e0570b69a8e 100644 --- a/nova/tests/unit/objects/test_pci_device.py +++ b/nova/tests/unit/objects/test_pci_device.py @@ -171,6 +171,16 @@ def test_pci_device_extra_info_card_serial_number(self): self.pci_device = pci_device.PciDevice.create(None, self.dev_dict) self.assertEqual(self.pci_device.card_serial_number, '42') + def test_pci_device_extra_info_network_capabilities(self): + self.dev_dict = copy.copy(dev_dict) + self.pci_device = pci_device.PciDevice.create(None, self.dev_dict) + self.assertEqual(self.pci_device.network_caps, []) + + self.dev_dict = copy.copy(dev_dict) + self.dev_dict['capabilities'] = {'network': ['sg', 'tso', 'tx']} + self.pci_device = pci_device.PciDevice.create(None, self.dev_dict) + self.assertEqual(self.pci_device.network_caps, ['sg', 'tso', 'tx']) + def test_update_device(self): self.pci_device = pci_device.PciDevice.create(None, dev_dict) self.pci_device.obj_reset_changes() diff --git a/nova/tests/unit/virt/libvirt/test_host.py b/nova/tests/unit/virt/libvirt/test_host.py index 534d4489d70..3a8a04454b7 100644 --- a/nova/tests/unit/virt/libvirt/test_host.py +++ b/nova/tests/unit/virt/libvirt/test_host.py @@ -1337,7 +1337,7 @@ def test_get_pcidev_info(self): "parent_ifname": "ens1", "capabilities": { "network": ["rx", "tx", "sg", "tso", "gso", "gro", "rxvlan", - "txvlan", "rxhash"], + "txvlan", "rxhash", "switchdev"], "sriov": {"pf_mac_address": "52:54:00:1e:59:c6", "vf_num": 1}, # Should be obtained from the parent PF in this case. 
diff --git a/releasenotes/notes/translate_vf_network_capabilities_to_port_binding-48abbfe0ce2923cf.yaml b/releasenotes/notes/translate_vf_network_capabilities_to_port_binding-48abbfe0ce2923cf.yaml new file mode 100644 index 00000000000..b5ee283c8c4 --- /dev/null +++ b/releasenotes/notes/translate_vf_network_capabilities_to_port_binding-48abbfe0ce2923cf.yaml @@ -0,0 +1,16 @@ +--- +fixes: + - | + Previously ``switchdev`` capabilities should be configured manually by a + user with admin privileges using port's binding profile. This blocked + regular users from managing ports with Open vSwitch hardware offloading + as providing write access to a port's binding profile to non-admin users + introduces security risks. For example, a binding profile may contain a + ``pci_slot`` definition, which denotes the host PCI address of the + device attached to the VM. A malicious user can use this parameter to + passthrough any host device to a guest, so it is impossible to provide + write access to a binding profile to regular users in many scenarios. + + This patch fixes this situation by translating VF capabilities reported + by Libvirt to Neutron port binding profiles. Other VF capabilities are + translated as well for possible future use. From 787839f6637f292fb5656725e5dae12fbe6e3c3e Mon Sep 17 00:00:00 2001 From: Sivasathurappan Radhakrishnan Date: Fri, 10 Mar 2017 22:16:42 +0000 Subject: [PATCH 37/73] Allow live migrate paused instance when post copy is enabled Live migration of paused instance fails when VIR_MIGRATE_POSTCOPY flag is set. In this patch, the flag is unset to permit live migration of paused instance. 
Change-Id: Ib5cbc948cb953e35a22bcbb859976f0afddcb662 Closes-Bug: #1671011 (cherry picked from commit 33fa92b6cb1dfeb88a4188c0e4e4ce51be1f7a4b) (cherry picked from commit 989ee448906f4ca623f6b14ef8c7af8c8fa10e2f) (cherry picked from commit 5db7a7b3db2c394b4cda095dfe3a0a72f8106466) --- nova/tests/unit/virt/libvirt/test_driver.py | 33 +++++++++++++++++++-- nova/virt/libvirt/driver.py | 11 +++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index caf35414782..8a00c9b88d2 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -694,6 +694,7 @@ def _create_test_instance(): 'numa_topology': None, 'config_drive': None, 'vm_mode': None, + 'vm_state': None, 'kernel_id': None, 'ramdisk_id': None, 'os_type': 'linux', @@ -12170,7 +12171,7 @@ def test_live_migration_update_volume_xml(self, mock_xml, mock_updated_guest_xml, mock_migrateToURI3): self.compute = manager.ComputeManager() - instance_ref = self.test_instance + instance_ref = objects.Instance(**self.test_instance) target_connection = '127.0.0.2' target_xml = self.device_xml_tmpl.format( @@ -12339,7 +12340,7 @@ def test_live_migration_with_valid_target_connect_addr(self, mock_xml, mock_migrateToURI3, mock_min_version): self.compute = manager.ComputeManager() - instance_ref = self.test_instance + instance_ref = objects.Instance(**self.test_instance) target_connection = '127.0.0.2' target_xml = self.device_xml_tmpl.format( @@ -12954,6 +12955,33 @@ def test_block_live_migration_tunnelled_migrateToURI3( drvr._live_migration_uri(target_connection), params=params, flags=expected_flags) + @mock.patch.object(host.Host, 'has_min_version', return_value=True) + @mock.patch.object(fakelibvirt.virDomain, "migrateToURI3") + @mock.patch('nova.virt.libvirt.migration.get_updated_guest_xml', + return_value='') + def test_live_migration_paused_instance_postcopy(self, 
mock_new_xml, + mock_migrateToURI3, + mock_min_version): + disk_paths = [] + params = {'bandwidth': CONF.libvirt.live_migration_bandwidth} + migrate_data = objects.LibvirtLiveMigrateData(block_migration=False, + serial_listen_addr=False) + dom = fakelibvirt.virDomain + guest = libvirt_guest.Guest(dom) + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + drvr._parse_migration_flags() + instance = objects.Instance(**self.test_instance) + instance.vm_state = vm_states.PAUSED + + drvr._live_migration_operation(self.context, instance, 'dest', + True, migrate_data, guest, + disk_paths) + + # Verify VIR_MIGRATE_POSTCOPY flag was not set + self.assertEqual(drvr._live_migration_flags, 27) + mock_migrateToURI3.assert_called_once_with( + drvr._live_migration_uri('dest'), params=params, flags=27) + @mock.patch.object(host.Host, 'has_min_version', return_value=True) @mock.patch.object(fakelibvirt.virDomain, "migrateToURI3") @mock.patch('nova.virt.libvirt.migration.get_updated_guest_xml', @@ -12963,7 +12991,6 @@ def test_block_live_migration_native_tls( self, mock_old_xml, mock_new_xml, mock_migrateToURI3, mock_min_version): self.flags(live_migration_with_native_tls=True, group='libvirt') - target_connection = None disk_paths = ['vda', 'vdb'] diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 65058c7dfa1..2b9e6cb984a 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -10115,6 +10115,17 @@ def _live_migration_operation(self, context, instance, dest, else: migration_flags = self._live_migration_flags + # Note(siva_krishnan): live migrating paused instance fails + # when VIR_MIGRATE_POSTCOPY flag is set. It is unset here + # to permit live migration of paused instance. 
+ if ( + instance.vm_state == vm_states.PAUSED and + self._is_post_copy_enabled(migration_flags) + ): + LOG.debug('Post-copy flag unset because instance is paused.', + instance=instance) + migration_flags ^= libvirt.VIR_MIGRATE_POSTCOPY + if not migrate_data.serial_listen_addr: # In this context we want to ensure that serial console is # disabled on source node. This is because nova couldn't From 7422642dd6e222959470b73dc6ba2450a792b9c7 Mon Sep 17 00:00:00 2001 From: Bence Romsics Date: Mon, 14 Aug 2023 13:03:13 +0200 Subject: [PATCH 38/73] Reproduce bug #2025480 in a functional test Written by gibi, I just cleaned it up. Change-Id: I8386a846b3685b8d03c59334ccfb2efbd4afe427 Co-Authored-By: Balazs Gibizer Related-Bug: #2025480 (cherry picked from commit 62300d4885549368f874b3e07b756017ff96c659) (cherry picked from commit 477ff2667d7ecd218fa5163d86d2719979dcdcd3) (cherry picked from commit 23c190a35839b396418d3e98af1e67587f9e9296) --- .../regressions/test_bug_2025480.py | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 nova/tests/functional/regressions/test_bug_2025480.py diff --git a/nova/tests/functional/regressions/test_bug_2025480.py b/nova/tests/functional/regressions/test_bug_2025480.py new file mode 100644 index 00000000000..f6c87109f79 --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_2025480.py @@ -0,0 +1,87 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+from unittest import mock + +from nova import context +from nova.objects import compute_node +from nova import test +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional import fixtures as func_fixtures +from nova.tests.functional import integrated_helpers + + +class UnshelveUpdateAvailableResourcesPeriodicRace( + test.TestCase, integrated_helpers.InstanceHelperMixin): + def setUp(self): + super(UnshelveUpdateAvailableResourcesPeriodicRace, self).setUp() + + placement = func_fixtures.PlacementFixture() + self.useFixture(placement) + self.placement = placement.api + self.neutron = nova_fixtures.NeutronFixture(self) + self.useFixture(self.neutron) + self.useFixture(nova_fixtures.GlanceFixture(self)) + # Start nova services. + self.api = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')).admin_api + self.api.microversion = 'latest' + self.notifier = self.useFixture( + nova_fixtures.NotificationFixture(self)) + + self.start_service('conductor') + self.start_service('scheduler') + + def test_unshelve_spawning_update_available_resources(self): + compute = self._start_compute('compute1') + + server = self._create_server( + networks=[{'port': self.neutron.port_1['id']}]) + + node = compute_node.ComputeNode.get_by_nodename( + context.get_admin_context(), 'compute1') + self.assertEqual(1, node.vcpus_used) + + # with default config shelve means immediate offload as well + req = { + 'shelve': {} + } + self.api.post_server_action(server['id'], req) + self._wait_for_server_parameter( + server, {'status': 'SHELVED_OFFLOADED', + 'OS-EXT-SRV-ATTR:host': None}) + + node = compute_node.ComputeNode.get_by_nodename( + context.get_admin_context(), 'compute1') + self.assertEqual(0, node.vcpus_used) + + def fake_spawn(*args, **kwargs): + self._run_periodics() + + with mock.patch.object( + compute.driver, 'spawn', side_effect=fake_spawn): + req = {'unshelve': None} + self.api.post_server_action(server['id'], req) + 
self.notifier.wait_for_versioned_notifications( + 'instance.unshelve.start') + self._wait_for_server_parameter( + server, + { + 'status': 'ACTIVE', + 'OS-EXT-STS:task_state': None, + 'OS-EXT-SRV-ATTR:host': 'compute1', + }) + + node = compute_node.ComputeNode.get_by_nodename( + context.get_admin_context(), 'compute1') + # This is the bug, the instance should have resources claimed + # self.assertEqual(1, node.vcpus_used) + self.assertEqual(0, node.vcpus_used) From e41962f5fa59c47e63468945e82c2e7164c24c38 Mon Sep 17 00:00:00 2001 From: Bence Romsics Date: Wed, 2 Aug 2023 16:22:55 +0200 Subject: [PATCH 39/73] Do not untrack resources of a server being unshelved This patch concerns the time when a VM is being unshelved and the compute manager set the task_state to spawning, claimed resources of the VM and then called driver.spawn(). So the instance is in vm_state SHELVED_OFFLOADED, task_state spawning. If at this point a new update_available_resource periodic job is started that collects all the instances assigned to the node to calculate resource usage. However the calculation assumed that a VM in SHELVED_OFFLOADED state does not need resource allocation on the node (probably being removed from the node as it is offloaded) and deleted the resource claim. Given all this we ended up with the VM spawned successfully but having lost the resource claim on the node. This patch changes what we do in vm_state SHELVED_OFFLOADED, task_state spawning. We no longer delete the resource claim in this state and keep tracking the resource in stats. 
Conflicts: nova/compute/vm_states.py conflict with 8c2e76598995f0d417653c60a63ea342baf4e880 Change-Id: I8c9944810c09d501a6d3f60f095d9817b756872d Closes-Bug: #2025480 (cherry picked from commit f1dc4ec39bcfda1bd4b97e233a9da498b6378c4f) (cherry picked from commit 4239d1fec2814c074482b740a2fd38a5d5ce6942) (cherry picked from commit 683ecc060e3bca818b9fb514d297e323bc8cb220) --- nova/compute/manager.py | 6 +++--- nova/compute/resource_tracker.py | 7 +++++-- nova/compute/stats.py | 3 ++- nova/compute/vm_states.py | 11 +++++++++-- .../functional/regressions/test_bug_2025480.py | 5 ++--- nova/tests/unit/compute/test_stats.py | 16 ++++++++++++++++ 6 files changed, 37 insertions(+), 11 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index d216812a351..e780eb97772 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -6809,9 +6809,9 @@ def _shelve_offload_instance(self, context, instance, clean_shutdown, instance.power_state = current_power_state # NOTE(mriedem): The vm_state has to be set before updating the - # resource tracker, see vm_states.ALLOW_RESOURCE_REMOVAL. The host/node - # values cannot be nulled out until after updating the resource tracker - # though. + # resource tracker, see vm_states.allow_resource_removal(). The + # host/node values cannot be nulled out until after updating the + # resource tracker though. 
instance.vm_state = vm_states.SHELVED_OFFLOADED instance.task_state = None instance.save(expected_task_state=[task_states.SHELVING, diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index ffbc7ed03fa..9bb05b49104 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -1546,7 +1546,8 @@ def _update_usage_from_instance(self, context, instance, nodename, # NOTE(sfinucan): Both brand new instances as well as instances that # are being unshelved will have is_new_instance == True is_removed_instance = not is_new_instance and (is_removed or - instance['vm_state'] in vm_states.ALLOW_RESOURCE_REMOVAL) + vm_states.allow_resource_removal( + vm_state=instance['vm_state'], task_state=instance.task_state)) if is_new_instance: self.tracked_instances.add(uuid) @@ -1605,7 +1606,9 @@ def _update_usage_from_instances(self, context, instances, nodename): instance_by_uuid = {} for instance in instances: - if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL: + if not vm_states.allow_resource_removal( + vm_state=instance['vm_state'], + task_state=instance.task_state): self._update_usage_from_instance(context, instance, nodename) instance_by_uuid[instance.uuid] = instance return instance_by_uuid diff --git a/nova/compute/stats.py b/nova/compute/stats.py index cfbee2e6bc1..e9180ec6d6d 100644 --- a/nova/compute/stats.py +++ b/nova/compute/stats.py @@ -105,7 +105,8 @@ def update_stats_for_instance(self, instance, is_removed=False): (vm_state, task_state, os_type, project_id) = \ self._extract_state_from_instance(instance) - if is_removed or vm_state in vm_states.ALLOW_RESOURCE_REMOVAL: + if is_removed or vm_states.allow_resource_removal( + vm_state=vm_state, task_state=task_state): self._decrement("num_instances") self.states.pop(uuid) else: diff --git a/nova/compute/vm_states.py b/nova/compute/vm_states.py index 633894c1ea4..1c4da06d155 100644 --- a/nova/compute/vm_states.py +++ b/nova/compute/vm_states.py @@ -27,6 
+27,7 @@ See http://wiki.openstack.org/VMState """ +from nova.compute import task_states from nova.objects import fields @@ -74,5 +75,11 @@ # states we allow to trigger crash dump ALLOW_TRIGGER_CRASH_DUMP = [ACTIVE, PAUSED, RESCUED, RESIZED, ERROR] -# states we allow resources to be freed in -ALLOW_RESOURCE_REMOVAL = [DELETED, SHELVED_OFFLOADED] + +def allow_resource_removal(vm_state, task_state=None): + """(vm_state, task_state) combinations we allow resources to be freed in""" + + return ( + vm_state == DELETED or + vm_state == SHELVED_OFFLOADED and task_state != task_states.SPAWNING + ) diff --git a/nova/tests/functional/regressions/test_bug_2025480.py b/nova/tests/functional/regressions/test_bug_2025480.py index f6c87109f79..c707a40a846 100644 --- a/nova/tests/functional/regressions/test_bug_2025480.py +++ b/nova/tests/functional/regressions/test_bug_2025480.py @@ -82,6 +82,5 @@ def fake_spawn(*args, **kwargs): node = compute_node.ComputeNode.get_by_nodename( context.get_admin_context(), 'compute1') - # This is the bug, the instance should have resources claimed - # self.assertEqual(1, node.vcpus_used) - self.assertEqual(0, node.vcpus_used) + # After the fix, the instance should have resources claimed + self.assertEqual(1, node.vcpus_used) diff --git a/nova/tests/unit/compute/test_stats.py b/nova/tests/unit/compute/test_stats.py index e713794a19a..b95475f09db 100644 --- a/nova/tests/unit/compute/test_stats.py +++ b/nova/tests/unit/compute/test_stats.py @@ -208,6 +208,22 @@ def test_update_stats_for_instance_offloaded(self): self.assertEqual(0, self.stats.num_os_type("Linux")) self.assertEqual(0, self.stats["num_vm_" + vm_states.BUILDING]) + def test_update_stats_for_instance_being_unshelved(self): + instance = self._create_instance() + self.stats.update_stats_for_instance(instance) + self.assertEqual(1, self.stats.num_instances_for_project("1234")) + + instance["vm_state"] = vm_states.SHELVED_OFFLOADED + instance["task_state"] = task_states.SPAWNING + 
self.stats.update_stats_for_instance(instance) + + self.assertEqual(1, self.stats.num_instances) + self.assertEqual(1, self.stats.num_instances_for_project(1234)) + self.assertEqual(1, self.stats["num_os_type_Linux"]) + self.assertEqual(1, self.stats["num_vm_%s" % + vm_states.SHELVED_OFFLOADED]) + self.assertEqual(1, self.stats["num_task_%s" % task_states.SPAWNING]) + def test_io_workload(self): vms = [vm_states.ACTIVE, vm_states.BUILDING, vm_states.PAUSED] tasks = [task_states.RESIZE_MIGRATING, task_states.REBUILDING, From 2350c179f9db133f9d7c0b2c12e495195ff1b32a Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Mon, 6 Nov 2023 21:15:11 +0100 Subject: [PATCH 40/73] Remove outdated comment about allocation ratios Setting allocation ratios via host aggregate metadata has not been possible since Ocata. Change-Id: Ia1a14a56739ec4c3f6930f1947a7423fd447fe34 (cherry picked from commit 88068781d494b263a875b44bcbb6dcd5a98671f3) (cherry picked from commit b81439e2a2e9d965c3447ebd243043afb6c031a3) (cherry picked from commit 698421064b4604087634f8ea219795dad0b4928c) --- doc/source/admin/scheduling.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/source/admin/scheduling.rst b/doc/source/admin/scheduling.rst index e0e5b7188b3..5296cd95f58 100644 --- a/doc/source/admin/scheduling.rst +++ b/doc/source/admin/scheduling.rst @@ -1112,10 +1112,9 @@ scheduling. Usage scenarios ~~~~~~~~~~~~~~~ -Since allocation ratios can be set via nova configuration, host aggregate -metadata and the placement API, it can be confusing to know which should be -used. This really depends on your scenario. A few common scenarios are detailed -here. +Since allocation ratios can be set via nova configuration and the placement +API, it can be confusing to know which should be used. This really depends on +your scenario. A few common scenarios are detailed here. 1. 
When the deployer wants to **always** set an override value for a resource on a compute node, the deployer should ensure that the From 3ef339a383e7e23085a582260badb347d96d8935 Mon Sep 17 00:00:00 2001 From: Artom Lifshitz Date: Tue, 31 Oct 2023 22:52:50 -0400 Subject: [PATCH 41/73] libvirt: Stop unconditionally enabling evmcs In I008841988547573878c4e06e82f0fa55084e51b5 we started enabling a bunch of libvirt enlightenments for Windows unconditionally. Turns out, the `evmcs` enlightenment only works on Intel hosts, and we broke the ability to run Windows guests on AMD machines. Until we become smarter about conditionally enabling evmcs (with something like traits for host CPU features), just stop enabling it at all. Change-Id: I2ff4fdecd9dc69de283f0e52e07df1aeaf0a9048 Closes-bug: 2009280 (cherry picked from commit 86a35e97d286cbb6e23f8cc7bec5a05f022da0cb) (cherry picked from commit 0b7a59ad2812c3501332155211b18f224cd55bca) (cherry picked from commit 2bffc53ce8fb5239cb9cbd59f3bbd51577da0c4a) --- nova/tests/unit/virt/libvirt/test_driver.py | 5 ++++- nova/virt/libvirt/driver.py | 1 - ...p-unconditionally-enabling-evmcs-993a825641c4b9f3.yaml | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 releasenotes/notes/libvirt-enlightenments-stop-unconditionally-enabling-evmcs-993a825641c4b9f3.yaml diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 8a00c9b88d2..e5f672b7040 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -27660,7 +27660,10 @@ def test_set_features_windows(self): self.assertTrue(hv.reenlightenment) self.assertTrue(hv.tlbflush) self.assertTrue(hv.ipi) - self.assertTrue(hv.evmcs) + # NOTE(artom) evmcs only works on Intel hosts, so we can't enable it + # unconditionally. Until we become smarter about it, just don't enable + # it at all. See bug 2009280. 
+ self.assertFalse(hv.evmcs) class LibvirtVolumeUsageTestCase(test.NoDBTestCase): diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 2b9e6cb984a..a4735c1d334 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -6167,7 +6167,6 @@ def _set_features(self, guest, os_type, image_meta, flavor): hv.reenlightenment = True hv.tlbflush = True hv.ipi = True - hv.evmcs = True # NOTE(kosamara): Spoofing the vendor_id aims to allow the nvidia # driver to work on windows VMs. At the moment, the nvidia driver diff --git a/releasenotes/notes/libvirt-enlightenments-stop-unconditionally-enabling-evmcs-993a825641c4b9f3.yaml b/releasenotes/notes/libvirt-enlightenments-stop-unconditionally-enabling-evmcs-993a825641c4b9f3.yaml new file mode 100644 index 00000000000..31609f2a2d6 --- /dev/null +++ b/releasenotes/notes/libvirt-enlightenments-stop-unconditionally-enabling-evmcs-993a825641c4b9f3.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Bug 2009280 has been fixed by no longer enabling the evmcs enlightenment in + the libvirt driver. evmcs only works on Intel CPUs, and domains with that + enlightenment cannot be started on AMD hosts. There is a possible future + feature to enable support for generating this enlightenment only when + running on Intel hosts. From 201bfa5bd21e3436b5882880530d5937b7dca61c Mon Sep 17 00:00:00 2001 From: songjie Date: Mon, 25 Dec 2023 16:59:36 +0800 Subject: [PATCH 42/73] libvirt: stop enabling hyperv feature reenlightenment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'reenlightenment' hyperv enlightenment will cause instances live-migration to fail (KVM currently doesn’t fully support reenlightenment notifications, see www.qemu.org/docs/master/system/i386/hyperv.html), so don't enable it now. 
Change-Id: I6821819450bc96e4304125ea3b76a0e462e6e33f Closes-Bug: #2046549 Related-Bug: #2009280 (cherry picked from commit e618e78edc6293d248a5fa2eb63b3fa636250fca) (cherry picked from commit 436e525a970bb991e147a0b4c3ce389a815252c2) (cherry picked from commit 47271bb705de445ca5e17ba5e3d503cade2a0959) --- nova/tests/unit/virt/libvirt/test_driver.py | 4 +++- nova/virt/libvirt/driver.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 8a00c9b88d2..733287dbc3c 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -27657,7 +27657,9 @@ def test_set_features_windows(self): self.assertTrue(hv.synic) self.assertTrue(hv.reset) self.assertTrue(hv.frequencies) - self.assertTrue(hv.reenlightenment) + # NOTE(jie) reenlightenment will cause instances live-migration + # failure, so don't enable it now. See bug 2046549. + self.assertFalse(hv.reenlightenment) self.assertTrue(hv.tlbflush) self.assertTrue(hv.ipi) self.assertTrue(hv.evmcs) diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 2b9e6cb984a..593dc0edf78 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -6164,7 +6164,6 @@ def _set_features(self, guest, os_type, image_meta, flavor): hv.synic = True hv.reset = True hv.frequencies = True - hv.reenlightenment = True hv.tlbflush = True hv.ipi = True hv.evmcs = True From 85083f7f266017411dd820527ba42413b53920ed Mon Sep 17 00:00:00 2001 From: OpenStack Release Bot Date: Tue, 30 Apr 2024 15:00:28 +0000 Subject: [PATCH 43/73] [stable-only] Update .gitreview for unmaintained/zed Change-Id: I2cb5d6fa0ba10b890b19c666a68d94fe0a23c2f0 --- .gitreview | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitreview b/.gitreview index bc0f313915b..766107b1b64 100644 --- a/.gitreview +++ b/.gitreview @@ -2,4 +2,4 @@ host=review.opendev.org port=29418 
project=openstack/nova.git -defaultbranch=stable/zed +defaultbranch=unmaintained/zed From bae6152d4269bd0b3f5fd8ea4f0abe483a6d3061 Mon Sep 17 00:00:00 2001 From: Jay Faulkner Date: Mon, 13 Nov 2023 15:21:31 -0800 Subject: [PATCH 44/73] [ironic] Partition & use cache for list_instance* list_instances and list_instance_uuids, as written in the Ironic driver, do not currently respect conductor_group paritioning. Given a nova compute is intended to limit it's scope of work to the conductor group it is configured to work with; this is a bug. Additionally, this should be a significant performance boost for a couple of reasons; firstly, instead of calling the Ironic API and getting all nodes, instead of the subset (when using conductor group), we're now properly getting the subset of nodes -- this is the optimized path in the Ironic DB and API code. Secondly, we're now using the driver's node cache to respond to these requests. Since list_instances and list_instance_uuids is used by periodic tasks, these operating with data that may be slightly stale should have minimal impact compared to the performance benefits. 
Closes-bug: #2043036 Change-Id: If31158e3269e5e06848c29294fdaa147beedb5a5 (cherry picked from commit fa3cf7d50cba921ea67eb161e6a199067ea62deb) (cherry picked from commit 555d7d0ad02d31476e2d751aa52be4087878f1a2) (cherry picked from commit 3226318b534847a5c553f2d94f9df19a631fd617) --- nova/tests/unit/virt/ironic/test_driver.py | 65 ++++++++----------- nova/virt/ironic/driver.py | 33 +++++++--- ...espect-partition-key-339ff653eaa00753.yaml | 5 ++ 3 files changed, 54 insertions(+), 49 deletions(-) create mode 100644 releasenotes/notes/ironic-list-instance-respect-partition-key-339ff653eaa00753.yaml diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index 52aa37ac134..ea40d73f599 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -582,71 +582,58 @@ def test__get_node_list_fail(self): @mock.patch.object(objects.Instance, 'get_by_uuid') def test_list_instances(self, mock_inst_by_uuid): - nodes = [] + nodes = {} instances = [] for i in range(2): uuid = uuidutils.generate_uuid() + node_uuid = uuidutils.generate_uuid() instances.append(fake_instance.fake_instance_obj(self.ctx, id=i, uuid=uuid)) - nodes.append(ironic_utils.get_test_node(instance_id=uuid, - fields=['instance_id'])) + nodes[node_uuid] = ironic_utils.get_test_node( + id=node_uuid, instance_id=uuid, fields=('instance_id',)) mock_inst_by_uuid.side_effect = instances - self.mock_conn.nodes.return_value = iter(nodes) + self.driver.node_cache = nodes response = self.driver.list_instances() - self.mock_conn.nodes.assert_called_with(associated=True, - fields=['instance_uuid']) expected_calls = [mock.call(mock.ANY, instances[0].uuid), mock.call(mock.ANY, instances[1].uuid)] mock_inst_by_uuid.assert_has_calls(expected_calls) self.assertEqual(['instance-00000000', 'instance-00000001'], sorted(response)) - # NOTE(dustinc) This test ensures we use instance_uuid not instance_id in - # 'fields' when calling ironic. 
+ @mock.patch.object(ironic_driver.IronicDriver, '_refresh_cache') @mock.patch.object(objects.Instance, 'get_by_uuid') - def test_list_instances_uses_instance_uuid(self, mock_inst_by_uuid): - self.driver.list_instances() - - self.mock_conn.nodes.assert_called_with(associated=True, - fields=['instance_uuid']) - - @mock.patch.object(objects.Instance, 'get_by_uuid') - def test_list_instances_fail(self, mock_inst_by_uuid): - self.mock_conn.nodes.side_effect = exception.NovaException + def test_list_instances_fail(self, mock_inst_by_uuid, mock_cache): + mock_cache.side_effect = exception.VirtDriverNotReady self.assertRaises(exception.VirtDriverNotReady, self.driver.list_instances) - self.mock_conn.nodes.assert_called_with(associated=True, - fields=['instance_uuid']) self.assertFalse(mock_inst_by_uuid.called) def test_list_instance_uuids(self): num_nodes = 2 - nodes = [] + nodes = {} for n in range(num_nodes): - nodes.append(ironic_utils.get_test_node( - instance_id=uuidutils.generate_uuid(), - fields=['instance_id'])) - self.mock_conn.nodes.return_value = iter(nodes) + node_uuid = uuidutils.generate_uuid() + instance_uuid = uuidutils.generate_uuid() + nodes[instance_uuid] = ironic_utils.get_test_node( + id=node_uuid, + instance_id=instance_uuid, + fields=('instance_id',)) + self.driver.node_cache = nodes + instance_uuids = self.driver.list_instance_uuids() + expected = nodes.keys() + + self.assertEqual(sorted(expected), sorted(instance_uuids)) + + @mock.patch.object(ironic_driver.IronicDriver, '_refresh_cache') + def test_list_instance_uuids_fail(self, mock_cache): + mock_cache.side_effect = exception.VirtDriverNotReady - uuids = self.driver.list_instance_uuids() - - self.mock_conn.nodes.assert_called_with(associated=True, - fields=['instance_uuid']) - expected = [n.instance_id for n in nodes] - self.assertEqual(sorted(expected), sorted(uuids)) - - # NOTE(dustinc) This test ensures we use instance_uuid not instance_id in - # 'fields' when calling ironic. 
- @mock.patch.object(objects.Instance, 'get_by_uuid') - def test_list_instance_uuids_uses_instance_uuid(self, mock_inst_by_uuid): - self.driver.list_instance_uuids() - - self.mock_conn.nodes.assert_called_with(associated=True, - fields=['instance_uuid']) + self.assertRaises(exception.VirtDriverNotReady, + self.driver.list_instance_uuids) @mock.patch.object(objects.InstanceList, 'get_uuids_by_host') @mock.patch.object(objects.ServiceList, 'get_all_computes_by_hv_type') diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index 117294ff897..9d49b96b0e1 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -647,13 +647,21 @@ def list_instances(self): :raises: VirtDriverNotReady """ - # NOTE(dustinc): The SDK returns an object with instance_id, - # but the Ironic API expects instance_uuid in query. + # NOTE(JayF): As of this writing, November 2023, this is only called + # one place; in compute/manager.py, and only if + # list_instance_uuids is not implemented. This means that + # this is effectively dead code in the Ironic driver. + if not self.node_cache: + # Empty cache, try to populate it. If we cannot populate it, this + # is OK. This information is only used to cleanup deleted nodes; + # if Ironic has no deleted nodes; we're good. + self._refresh_cache() + context = nova_context.get_admin_context() - return [objects.Instance.get_by_uuid(context, i.instance_id).name - for i in self._get_node_list(return_generator=True, - associated=True, - fields=['instance_uuid'])] + + return [objects.Instance.get_by_uuid(context, node.instance_id).name + for node in self.node_cache.values() + if node.instance_id is not None] def list_instance_uuids(self): """Return the IDs of all the instances provisioned. @@ -662,10 +670,15 @@ def list_instance_uuids(self): :raises: VirtDriverNotReady """ - # NOTE(dustinc): The SDK returns an object with instance_id, - # but the Ironic API expects instance_uuid in query. 
- return [node.instance_id for node in self._get_node_list( - return_generator=True, associated=True, fields=['instance_uuid'])] + if not self.node_cache: + # Empty cache, try to populate it. If we cannot populate it, this + # is OK. This information is only used to cleanup deleted nodes; + # if Ironic has no deleted nodes; we're good. + self._refresh_cache() + + return [node.instance_id + for node in self.node_cache.values() + if node.instance_id is not None] def node_is_available(self, nodename): """Confirms a Nova hypervisor node exists in the Ironic inventory. diff --git a/releasenotes/notes/ironic-list-instance-respect-partition-key-339ff653eaa00753.yaml b/releasenotes/notes/ironic-list-instance-respect-partition-key-339ff653eaa00753.yaml new file mode 100644 index 00000000000..96f2a12b8ef --- /dev/null +++ b/releasenotes/notes/ironic-list-instance-respect-partition-key-339ff653eaa00753.yaml @@ -0,0 +1,5 @@ +fixes: + - Ironic virt driver now uses the node cache and respects partition keys, + such as conductor group, for list_instances and list_instance_uuids calls. + This fix will improve performance of the periodic queries which use these + driver methods and reduce API and DB load on the backing Ironic service. From fefc1dd84a1687970329809da774fee1e0673173 Mon Sep 17 00:00:00 2001 From: Elod Illes Date: Tue, 25 Jun 2024 15:57:48 +0200 Subject: [PATCH 45/73] [CI] Use zed-last from tempest for ceph multistore job nova-ceph-multistore job is broken on zed branch. This patch sets to use zed-last tagged version of cinder-tempest-plugin. Also exclude 5 test cases that are constantly failing recently. 
Change-Id: I8a29ca56d68703b65f77b75a7f14a9657467ca2e --- .zuul.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.zuul.yaml b/.zuul.yaml index 7656bacd865..550332abc7d 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -591,9 +591,21 @@ irrelevant-files: *nova-base-irrelevant-files required-projects: - openstack/nova + - name: openstack/cinder-tempest-plugin + override-checkout: zed-last pre-run: - playbooks/ceph/glance-copy-policy.yaml vars: + # NOTE(elod.illes): this job started to break with the following five + # test cases, somewhere around merging cinder-tempest-plugin patch + # I281f881ad565e565839522ddf02057f7545c7146 so let's just exclude + # them to unblock the gate. + tempest_exclude_regex: "\ + (test_delete_dep_chain)|\ + (test_delete_dep_chain_2)|\ + (test_delete_source_snapshot)|\ + (test_delete_source_volume)|\ + (test_nova_image_snapshot_dependency)" # NOTE(danms): These tests create an empty non-raw image, which nova # will refuse because we set never_download_image_if_on_rbd in this job. # Just skip these tests for this case. 
From d69d441cf5d82f69d8ed7d555a6af73624866400 Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Mon, 1 Apr 2024 07:32:11 -0700 Subject: [PATCH 46/73] Reject qcow files with data-file attributes Change-Id: Ic3fa16f55acc38cf6c1a4ac1dce4487225e66d04 Closes-Bug: #2059809 (cherry picked from commit ec9c55cbbc91d1f31e42ced289a7c82cf79dc2a2) (cherry picked from commit 58d933eafb3f7164419000700a305c8f75d5cb6e) (cherry picked from commit 736328f78fb88b6d567b94b50cd14b3ebef08a5e) (cherry picked from commit af4d819c606d6662d0b086365a51f5220b596e48) --- nova/tests/unit/virt/libvirt/test_utils.py | 1 + nova/tests/unit/virt/test_images.py | 31 ++++++++++++++++++++++ nova/virt/images.py | 9 +++++++ 3 files changed, 41 insertions(+) diff --git a/nova/tests/unit/virt/libvirt/test_utils.py b/nova/tests/unit/virt/libvirt/test_utils.py index 0b80bde49fb..d91ecf047bf 100644 --- a/nova/tests/unit/virt/libvirt/test_utils.py +++ b/nova/tests/unit/virt/libvirt/test_utils.py @@ -382,6 +382,7 @@ class FakeImgInfo(object): FakeImgInfo.file_format = file_format FakeImgInfo.backing_file = backing_file FakeImgInfo.virtual_size = 1 + FakeImgInfo.format_specific = None if file_format == 'raw' else {} return FakeImgInfo() diff --git a/nova/tests/unit/virt/test_images.py b/nova/tests/unit/virt/test_images.py index 62a61c1e8b7..272a1cae362 100644 --- a/nova/tests/unit/virt/test_images.py +++ b/nova/tests/unit/virt/test_images.py @@ -112,6 +112,37 @@ def test_fetch_to_raw_errors(self, convert_image, qemu_img_info, fetch): images.fetch_to_raw, None, 'href123', '/no/path') + @mock.patch.object(images, 'convert_image', + side_effect=exception.ImageUnacceptable) + @mock.patch.object(images, 'qemu_img_info') + @mock.patch.object(images, 'fetch') + def test_fetch_to_raw_data_file(self, convert_image, qemu_img_info_fn, + fetch): + # NOTE(danms): the above test needs the following line as well, as it + # is broken without it. 
+ qemu_img_info = qemu_img_info_fn.return_value + qemu_img_info.backing_file = None + qemu_img_info.file_format = 'qcow2' + qemu_img_info.virtual_size = 20 + qemu_img_info.format_specific = {'data': {'data-file': 'somefile'}} + self.assertRaisesRegex(exception.ImageUnacceptable, + 'Image href123 is unacceptable.*somefile', + images.fetch_to_raw, + None, 'href123', '/no/path') + + @mock.patch('os.rename') + @mock.patch.object(images, 'qemu_img_info') + @mock.patch.object(images, 'fetch') + def test_fetch_to_raw_from_raw(self, fetch, qemu_img_info_fn, mock_rename): + # Make sure we support a case where we fetch an already-raw image and + # qemu-img returns None for "format_specific". + qemu_img_info = qemu_img_info_fn.return_value + qemu_img_info.file_format = 'raw' + qemu_img_info.backing_file = None + qemu_img_info.format_specific = None + images.fetch_to_raw(None, 'href123', '/no/path') + mock_rename.assert_called_once_with('/no/path.part', '/no/path') + @mock.patch.object(compute_utils, 'disk_ops_semaphore') @mock.patch('nova.privsep.utils.supports_direct_io', return_value=True) @mock.patch('oslo_concurrency.processutils.execute') diff --git a/nova/virt/images.py b/nova/virt/images.py index f13c8722909..5f80a1d0758 100644 --- a/nova/virt/images.py +++ b/nova/virt/images.py @@ -157,6 +157,15 @@ def fetch_to_raw(context, image_href, path, trusted_certs=None): reason=(_("fmt=%(fmt)s backed by: %(backing_file)s") % {'fmt': fmt, 'backing_file': backing_file})) + try: + data_file = data.format_specific['data']['data-file'] + except (KeyError, TypeError, AttributeError): + data_file = None + if data_file is not None: + raise exception.ImageUnacceptable(image_id=image_href, + reason=(_("fmt=%(fmt)s has data-file: %(data_file)s") % + {'fmt': fmt, 'data_file': data_file})) + if fmt == 'vmdk': check_vmdk_image(image_href, data) From da352edceb74dbd715268f94516503042b48cc90 Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Wed, 17 Apr 2024 07:06:13 -0700 Subject: [PATCH 47/73] 
Check images with format_inspector for safety It has been asserted that we should not be calling qemu-img info on untrusted files. That means we need to know if they have a backing_file, data_file or other unsafe configuration *before* we use qemu-img to probe or convert them. This grafts glance's format_inspector module into nova/images so we can use it to check the file early for safety. The expectation is that this will be moved to oslo.utils (or something) later and thus we will just delete the file from nova and change our import when that happens. NOTE: This includes whitespace changes from the glance version of format_inspector.py because of autopep8 demands. Change-Id: Iaefbe41b4c4bf0cf95d8f621653fdf65062aaa59 Closes-Bug: #2059809 (cherry picked from commit 9cdce715945619fc851ab3f43c97fab4bae4e35a) (cherry picked from commit f07fa55fd86726eeafcd4c0c687bc49dd4df9f4c) (cherry picked from commit 0acf5ee7b5dfb6ff0f9a9745f5ad2a0ed2bf65bf) (cherry picked from commit 67e5376dd64407f5aaf1ea5f8c896e356064a2c9) --- nova/conf/workarounds.py | 10 + nova/image/format_inspector.py | 889 +++++++++++++++++++++ nova/tests/unit/virt/libvirt/test_utils.py | 48 +- nova/tests/unit/virt/test_images.py | 136 +++- nova/virt/images.py | 47 +- 5 files changed, 1121 insertions(+), 9 deletions(-) create mode 100644 nova/image/format_inspector.py diff --git a/nova/conf/workarounds.py b/nova/conf/workarounds.py index e485ae673a5..924e799b620 100644 --- a/nova/conf/workarounds.py +++ b/nova/conf/workarounds.py @@ -438,6 +438,16 @@ Howerver, if you don't use automatic cleaning, it can cause an extra delay before and Ironic node is available for building a new Nova instance. +"""), + cfg.BoolOpt( + 'disable_deep_image_inspection', + default=False, + help=""" +This disables the additional deep image inspection that the compute node does +when downloading from glance. This includes backing-file, data-file, and +known-features detection *before* passing the image to qemu-img. 
Generally, +this inspection should be enabled for maximum safety, but this workaround +option allows disabling it if there is a compatibility concern. """), ] diff --git a/nova/image/format_inspector.py b/nova/image/format_inspector.py new file mode 100644 index 00000000000..268c98b99cb --- /dev/null +++ b/nova/image/format_inspector.py @@ -0,0 +1,889 @@ +# Copyright 2020 Red Hat, Inc +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +""" +This is a python implementation of virtual disk format inspection routines +gathered from various public specification documents, as well as qemu disk +driver code. It attempts to store and parse the minimum amount of data +required, and in a streaming-friendly manner to collect metadata about +complex-format images. +""" + +import struct + +from oslo_log import log as logging + +LOG = logging.getLogger(__name__) + + +def chunked_reader(fileobj, chunk_size=512): + while True: + chunk = fileobj.read(chunk_size) + if not chunk: + break + yield chunk + + +class CaptureRegion(object): + """Represents a region of a file we want to capture. + + A region of a file we want to capture requires a byte offset into + the file and a length. This is expected to be used by a data + processing loop, calling capture() with the most recently-read + chunk. This class handles the task of grabbing the desired region + of data across potentially multiple fractional and unaligned reads. 
+ + :param offset: Byte offset into the file starting the region + :param length: The length of the region + """ + + def __init__(self, offset, length): + self.offset = offset + self.length = length + self.data = b'' + + @property + def complete(self): + """Returns True when we have captured the desired data.""" + return self.length == len(self.data) + + def capture(self, chunk, current_position): + """Process a chunk of data. + + This should be called for each chunk in the read loop, at least + until complete returns True. + + :param chunk: A chunk of bytes in the file + :param current_position: The position of the file processed by the + read loop so far. Note that this will be + the position in the file *after* the chunk + being presented. + """ + read_start = current_position - len(chunk) + if (read_start <= self.offset <= current_position or + self.offset <= read_start <= (self.offset + self.length)): + if read_start < self.offset: + lead_gap = self.offset - read_start + else: + lead_gap = 0 + self.data += chunk[lead_gap:] + self.data = self.data[:self.length] + + +class ImageFormatError(Exception): + """An unrecoverable image format error that aborts the process.""" + pass + + +class TraceDisabled(object): + """A logger-like thing that swallows tracing when we do not want it.""" + + def debug(self, *a, **k): + pass + + info = debug + warning = debug + error = debug + + +class FileInspector(object): + """A stream-based disk image inspector. + + This base class works on raw images and is subclassed for more + complex types. It is to be presented with the file to be examined + one chunk at a time, during read processing and will only store + as much data as necessary to determine required attributes of + the file. + """ + + def __init__(self, tracing=False): + self._total_count = 0 + + # NOTE(danms): The logging in here is extremely verbose for a reason, + # but should never really be enabled at that level at runtime. 
To + # retain all that work and assist in future debug, we have a separate + # debug flag that can be passed from a manual tool to turn it on. + if tracing: + self._log = logging.getLogger(str(self)) + else: + self._log = TraceDisabled() + self._capture_regions = {} + + def _capture(self, chunk, only=None): + for name, region in self._capture_regions.items(): + if only and name not in only: + continue + if not region.complete: + region.capture(chunk, self._total_count) + + def eat_chunk(self, chunk): + """Call this to present chunks of the file to the inspector.""" + pre_regions = set(self._capture_regions.keys()) + + # Increment our position-in-file counter + self._total_count += len(chunk) + + # Run through the regions we know of to see if they want this + # data + self._capture(chunk) + + # Let the format do some post-read processing of the stream + self.post_process() + + # Check to see if the post-read processing added new regions + # which may require the current chunk. + new_regions = set(self._capture_regions.keys()) - pre_regions + if new_regions: + self._capture(chunk, only=new_regions) + + def post_process(self): + """Post-read hook to process what has been read so far. + + This will be called after each chunk is read and potentially captured + by the defined regions. If any regions are defined by this call, + those regions will be presented with the current chunk in case it + is within one of the new regions. 
+ """ + pass + + def region(self, name): + """Get a CaptureRegion by name.""" + return self._capture_regions[name] + + def new_region(self, name, region): + """Add a new CaptureRegion by name.""" + if self.has_region(name): + # This is a bug, we tried to add the same region twice + raise ImageFormatError('Inspector re-added region %s' % name) + self._capture_regions[name] = region + + def has_region(self, name): + """Returns True if named region has been defined.""" + return name in self._capture_regions + + @property + def format_match(self): + """Returns True if the file appears to be the expected format.""" + return True + + @property + def virtual_size(self): + """Returns the virtual size of the disk image, or zero if unknown.""" + return self._total_count + + @property + def actual_size(self): + """Returns the total size of the file, usually smaller than + virtual_size. NOTE: this will only be accurate if the entire + file is read and processed. + """ + return self._total_count + + @property + def complete(self): + """Returns True if we have all the information needed.""" + return all(r.complete for r in self._capture_regions.values()) + + def __str__(self): + """The string name of this file format.""" + return 'raw' + + @property + def context_info(self): + """Return info on amount of data held in memory for auditing. + + This is a dict of region:sizeinbytes items that the inspector + uses to examine the file. + """ + return {name: len(region.data) for name, region in + self._capture_regions.items()} + + @classmethod + def from_file(cls, filename): + """Read as much of a file as necessary to complete inspection. + + NOTE: Because we only read as much of the file as necessary, the + actual_size property will not reflect the size of the file, but the + amount of data we read before we satisfied the inspector. + + Raises ImageFormatError if we cannot parse the file. 
+ """ + inspector = cls() + with open(filename, 'rb') as f: + for chunk in chunked_reader(f): + inspector.eat_chunk(chunk) + if inspector.complete: + # No need to eat any more data + break + if not inspector.complete or not inspector.format_match: + raise ImageFormatError('File is not in requested format') + return inspector + + def safety_check(self): + """Perform some checks to determine if this file is safe. + + Returns True if safe, False otherwise. It may raise ImageFormatError + if safety cannot be guaranteed because of parsing or other errors. + """ + return True + + +# The qcow2 format consists of a big-endian 72-byte header, of which +# only a small portion has information we care about: +# +# Dec Hex Name +# 0 0x00 Magic 4-bytes 'QFI\xfb' +# 4 0x04 Version (uint32_t, should always be 2 for modern files) +# . . . +# 8 0x08 Backing file offset (uint64_t) +# 24 0x18 Size in bytes (unint64_t) +# . . . +# 72 0x48 Incompatible features bitfield (6 bytes) +# +# https://gitlab.com/qemu-project/qemu/-/blob/master/docs/interop/qcow2.txt +class QcowInspector(FileInspector): + """QEMU QCOW2 Format + + This should only require about 32 bytes of the beginning of the file + to determine the virtual size, and 104 bytes to perform the safety check. 
+ """ + + BF_OFFSET = 0x08 + BF_OFFSET_LEN = 8 + I_FEATURES = 0x48 + I_FEATURES_LEN = 8 + I_FEATURES_DATAFILE_BIT = 3 + I_FEATURES_MAX_BIT = 4 + + def __init__(self, *a, **k): + super(QcowInspector, self).__init__(*a, **k) + self.new_region('header', CaptureRegion(0, 512)) + + def _qcow_header_data(self): + magic, version, bf_offset, bf_sz, cluster_bits, size = ( + struct.unpack('>4sIQIIQ', self.region('header').data[:32])) + return magic, size + + @property + def has_header(self): + return self.region('header').complete + + @property + def virtual_size(self): + if not self.region('header').complete: + return 0 + if not self.format_match: + return 0 + magic, size = self._qcow_header_data() + return size + + @property + def format_match(self): + if not self.region('header').complete: + return False + magic, size = self._qcow_header_data() + return magic == b'QFI\xFB' + + @property + def has_backing_file(self): + if not self.region('header').complete: + return None + if not self.format_match: + return False + bf_offset_bytes = self.region('header').data[ + self.BF_OFFSET:self.BF_OFFSET + self.BF_OFFSET_LEN] + # nonzero means "has a backing file" + bf_offset, = struct.unpack('>Q', bf_offset_bytes) + return bf_offset != 0 + + @property + def has_unknown_features(self): + if not self.region('header').complete: + return None + if not self.format_match: + return False + i_features = self.region('header').data[ + self.I_FEATURES:self.I_FEATURES + self.I_FEATURES_LEN] + + # This is the maximum byte number we should expect any bits to be set + max_byte = self.I_FEATURES_MAX_BIT // 8 + + # The flag bytes are in big-endian ordering, so if we process + # them in index-order, they're reversed + for i, byte_num in enumerate(reversed(range(self.I_FEATURES_LEN))): + if byte_num == max_byte: + # If we're in the max-allowed byte, allow any bits less than + # the maximum-known feature flag bit to be set + allow_mask = ((1 << self.I_FEATURES_MAX_BIT) - 1) + elif byte_num > max_byte: + 
# If we're above the byte with the maximum known feature flag + # bit, then we expect all zeroes + allow_mask = 0x0 + else: + # Any earlier-than-the-maximum byte can have any of the flag + # bits set + allow_mask = 0xFF + + if i_features[i] & ~allow_mask: + LOG.warning('Found unknown feature bit in byte %i: %s/%s', + byte_num, bin(i_features[byte_num] & ~allow_mask), + bin(allow_mask)) + return True + + return False + + @property + def has_data_file(self): + if not self.region('header').complete: + return None + if not self.format_match: + return False + i_features = self.region('header').data[ + self.I_FEATURES:self.I_FEATURES + self.I_FEATURES_LEN] + + # First byte of bitfield, which is i_features[7] + byte = self.I_FEATURES_LEN - 1 - self.I_FEATURES_DATAFILE_BIT // 8 + # Third bit of bitfield, which is 0x04 + bit = 1 << (self.I_FEATURES_DATAFILE_BIT - 1 % 8) + return bool(i_features[byte] & bit) + + def __str__(self): + return 'qcow2' + + def safety_check(self): + return (not self.has_backing_file and + not self.has_data_file and + not self.has_unknown_features) + + +# The VHD (or VPC as QEMU calls it) format consists of a big-endian +# 512-byte "footer" at the beginning of the file with various +# information, most of which does not matter to us: +# +# Dec Hex Name +# 0 0x00 Magic string (8-bytes, always 'conectix') +# 40 0x28 Disk size (uint64_t) +# +# https://github.com/qemu/qemu/blob/master/block/vpc.c +class VHDInspector(FileInspector): + """Connectix/MS VPC VHD Format + + This should only require about 512 bytes of the beginning of the file + to determine the virtual size. 
+ """ + + def __init__(self, *a, **k): + super(VHDInspector, self).__init__(*a, **k) + self.new_region('header', CaptureRegion(0, 512)) + + @property + def format_match(self): + return self.region('header').data.startswith(b'conectix') + + @property + def virtual_size(self): + if not self.region('header').complete: + return 0 + + if not self.format_match: + return 0 + + return struct.unpack('>Q', self.region('header').data[40:48])[0] + + def __str__(self): + return 'vhd' + + +# The VHDX format consists of a complex dynamic little-endian +# structure with multiple regions of metadata and data, linked by +# offsets with in the file (and within regions), identified by MSFT +# GUID strings. The header is a 320KiB structure, only a few pieces of +# which we actually need to capture and interpret: +# +# Dec Hex Name +# 0 0x00000 Identity (Technically 9-bytes, padded to 64KiB, the first +# 8 bytes of which are 'vhdxfile') +# 196608 0x30000 The Region table (64KiB of a 32-byte header, followed +# by up to 2047 36-byte region table entry structures) +# +# The region table header includes two items we need to read and parse, +# which are: +# +# 196608 0x30000 4-byte signature ('regi') +# 196616 0x30008 Entry count (uint32-t) +# +# The region table entries follow the region table header immediately +# and are identified by a 16-byte GUID, and provide an offset of the +# start of that region. We care about the "metadata region", identified +# by the METAREGION class variable. The region table entry is (offsets +# from the beginning of the entry, since it could be in multiple places): +# +# 0 0x00000 16-byte MSFT GUID +# 16 0x00010 Offset of the actual metadata region (uint64_t) +# +# When we find the METAREGION table entry, we need to grab that offset +# and start examining the region structure at that point. That +# consists of a metadata table of structures, which point to places in +# the data in an unstructured space that follows. 
The header is +# (offsets relative to the region start): +# +# 0 0x00000 8-byte signature ('metadata') +# . . . +# 16 0x00010 2-byte entry count (up to 2047 entries max) +# +# This header is followed by the specified number of metadata entry +# structures, identified by GUID: +# +# 0 0x00000 16-byte MSFT GUID +# 16 0x00010 4-byte offset (uint32_t, relative to the beginning of +# the metadata region) +# +# We need to find the "Virtual Disk Size" metadata item, identified by +# the GUID in the VIRTUAL_DISK_SIZE class variable, grab the offset, +# add it to the offset of the metadata region, and examine that 8-byte +# chunk of data that follows. +# +# The "Virtual Disk Size" is a naked uint64_t which contains the size +# of the virtual disk, and is our ultimate target here. +# +# https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-vhdx/83e061f8-f6e2-4de1-91bd-5d518a43d477 +class VHDXInspector(FileInspector): + """MS VHDX Format + + This requires some complex parsing of the stream. The first 256KiB + of the image is stored to get the header and region information, + and then we capture the first metadata region to read those + records, find the location of the virtual size data and parse + it. This needs to store the metadata table entries up until the + VDS record, which may consist of up to 2047 32-byte entries at + max. Finally, it must store a chunk of data at the offset of the + actual VDS uint64. + + """ + METAREGION = '8B7CA206-4790-4B9A-B8FE-575F050F886E' + VIRTUAL_DISK_SIZE = '2FA54224-CD1B-4876-B211-5DBED83BF4B8' + VHDX_METADATA_TABLE_MAX_SIZE = 32 * 2048 # From qemu + + def __init__(self, *a, **k): + super(VHDXInspector, self).__init__(*a, **k) + self.new_region('ident', CaptureRegion(0, 32)) + self.new_region('header', CaptureRegion(192 * 1024, 64 * 1024)) + + def post_process(self): + # After reading a chunk, we may have the following conditions: + # + # 1. 
We may have just completed the header region, and if so, + # we need to immediately read and calculate the location of + # the metadata region, as it may be starting in the same + # read we just did. + # 2. We may have just completed the metadata region, and if so, + # we need to immediately calculate the location of the + # "virtual disk size" record, as it may be starting in the + # same read we just did. + if self.region('header').complete and not self.has_region('metadata'): + region = self._find_meta_region() + if region: + self.new_region('metadata', region) + elif self.has_region('metadata') and not self.has_region('vds'): + region = self._find_meta_entry(self.VIRTUAL_DISK_SIZE) + if region: + self.new_region('vds', region) + + @property + def format_match(self): + return self.region('ident').data.startswith(b'vhdxfile') + + @staticmethod + def _guid(buf): + """Format a MSFT GUID from the 16-byte input buffer.""" + guid_format = '= 2048: + raise ImageFormatError('Region count is %i (limit 2047)' % count) + + # Process the regions until we find the metadata one; grab the + # offset and return + self._log.debug('Region entry first is %x', region_entry_first) + self._log.debug('Region entries %i', count) + meta_offset = 0 + for i in range(0, count): + entry_start = region_entry_first + (i * 32) + entry_end = entry_start + 32 + entry = self.region('header').data[entry_start:entry_end] + self._log.debug('Entry offset is %x', entry_start) + + # GUID is the first 16 bytes + guid = self._guid(entry[:16]) + if guid == self.METAREGION: + # This entry is the metadata region entry + meta_offset, meta_len, meta_req = struct.unpack( + '= 2048: + raise ImageFormatError( + 'Metadata item count is %i (limit 2047)' % count) + + for i in range(0, count): + entry_offset = 32 + (i * 32) + guid = self._guid(meta_buffer[entry_offset:entry_offset + 16]) + if guid == desired_guid: + # Found the item we are looking for by id. 
+ # Stop our region from capturing + item_offset, item_length, _reserved = struct.unpack( + ' Date: Mon, 24 Jun 2024 09:09:36 -0700 Subject: [PATCH 48/73] Additional qemu safety checking on base images There is an additional way we can be fooled into using a qcow2 file with a data-file, which is uploading it as raw to glance and then booting an instance from it. Because when we go to create the ephemeral disk from a cached base image, we've lost the information about the original source's format, we probe the image's file type without a strict format specified. If a qcow2 file is listed in glance as a raw, we won't notice it until it is too late. This brings over another piece of code (proposed against) glance's format inspector which provides a safe format detection routine. This patch uses that to detect the format of and run a safety check on the base image each time we go to use it to create an ephemeral disk image from it. This also detects QED files and always marks them as unsafe as we do not support that format at all. Since we could be fooled into downloading one and passing it to qemu-img if we don't recognize it, we need to detect and reject it as unsafe. Conflicts: nova/tests/unit/virt/libvirt/test_utils.py nova/virt/libvirt/utils.py NOTE(elod.illes): conflicts are due to encryption support adding patch I5d6d2a7b03b5ace0826af80c4004de852579ff12 was introduced in zed. 
Change-Id: I4881c8cbceb30c1ff2d2b859c554e0d02043f1f5 (cherry picked from commit b1b88bf001757546fbbea959f4b73cb344407dfb) (cherry picked from commit 8a0d5f2afaf40c4554419a0b2488ce092eda7a1a) (cherry picked from commit 0269234dc42fe2c320dc4696123cf5132642f9b7) (cherry picked from commit 9e10ac25490e7b5353cb01e768d22eb5a1f92825) --- nova/image/format_inspector.py | 70 ++++++++++++++++--- nova/tests/unit/virt/libvirt/test_driver.py | 7 +- .../unit/virt/libvirt/test_imagebackend.py | 45 ++++++++++-- nova/tests/unit/virt/libvirt/test_utils.py | 39 ++++++++++- nova/virt/libvirt/imagebackend.py | 15 ++++ nova/virt/libvirt/utils.py | 28 ++++++++ 6 files changed, 185 insertions(+), 19 deletions(-) diff --git a/nova/image/format_inspector.py b/nova/image/format_inspector.py index 268c98b99cb..8e57d7ed2c4 100644 --- a/nova/image/format_inspector.py +++ b/nova/image/format_inspector.py @@ -368,6 +368,23 @@ def safety_check(self): not self.has_unknown_features) +class QEDInspector(FileInspector): + def __init__(self, tracing=False): + super().__init__(tracing) + self.new_region('header', CaptureRegion(0, 512)) + + @property + def format_match(self): + if not self.region('header').complete: + return False + return self.region('header').data.startswith(b'QED\x00') + + def safety_check(self): + # QED format is not supported by anyone, but we want to detect it + # and mark it as just always unsafe. + return False + + # The VHD (or VPC as QEMU calls it) format consists of a big-endian # 512-byte "footer" at the beginning of the file with various # information, most of which does not matter to us: @@ -871,19 +888,52 @@ def close(self): self._source.close() +ALL_FORMATS = { + 'raw': FileInspector, + 'qcow2': QcowInspector, + 'vhd': VHDInspector, + 'vhdx': VHDXInspector, + 'vmdk': VMDKInspector, + 'vdi': VDIInspector, + 'qed': QEDInspector, +} + + def get_inspector(format_name): """Returns a FormatInspector class based on the given name. 
:param format_name: The name of the disk_format (raw, qcow2, etc). :returns: A FormatInspector or None if unsupported. """ - formats = { - 'raw': FileInspector, - 'qcow2': QcowInspector, - 'vhd': VHDInspector, - 'vhdx': VHDXInspector, - 'vmdk': VMDKInspector, - 'vdi': VDIInspector, - } - - return formats.get(format_name) + + return ALL_FORMATS.get(format_name) + + +def detect_file_format(filename): + """Attempts to detect the format of a file. + + This runs through a file one time, running all the known inspectors in + parallel. It stops reading the file once one of them matches or all of + them are sure they don't match. + + Returns the FileInspector that matched, if any. None if 'raw'. + """ + inspectors = {k: v() for k, v in ALL_FORMATS.items()} + with open(filename, 'rb') as f: + for chunk in chunked_reader(f): + for format, inspector in list(inspectors.items()): + try: + inspector.eat_chunk(chunk) + except ImageFormatError: + # No match, so stop considering this format + inspectors.pop(format) + continue + if (inspector.format_match and inspector.complete and + format != 'raw'): + # First complete match (other than raw) wins + return inspector + if all(i.complete for i in inspectors.values()): + # If all the inspectors are sure they are not a match, avoid + # reading to the end of the file to settle on 'raw'. 
+ break + return inspectors['raw'] diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 36fdae65d9d..a34a20619f0 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -14275,10 +14275,11 @@ def test_create_images_and_backing_images_exist( '/fake/instance/dir', disk_info) self.assertFalse(mock_fetch_image.called) + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch('nova.privsep.path.utime') @mock.patch('nova.virt.libvirt.utils.create_image') def test_create_images_and_backing_ephemeral_gets_created( - self, mock_create_cow_image, mock_utime): + self, mock_create_cow_image, mock_utime, mock_detect): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) base_dir = os.path.join(CONF.instances_path, @@ -16018,11 +16019,13 @@ def test_create_ephemeral_specified_fs(self, fake_mkfs): fake_mkfs.assert_has_calls([mock.call('ext4', '/dev/something', 'myVol')]) + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch('nova.privsep.path.utime') @mock.patch('nova.virt.libvirt.utils.fetch_image') @mock.patch('nova.virt.libvirt.utils.create_image') def test_create_ephemeral_specified_fs_not_valid( - self, mock_create_cow_image, mock_fetch_image, mock_utime): + self, mock_create_cow_image, mock_fetch_image, mock_utime, + mock_detect): CONF.set_override('default_ephemeral_format', 'ext4') ephemerals = [{'device_type': 'disk', 'disk_bus': 'virtio', diff --git a/nova/tests/unit/virt/libvirt/test_imagebackend.py b/nova/tests/unit/virt/libvirt/test_imagebackend.py index 0dc1009c920..853c5a200c8 100644 --- a/nova/tests/unit/virt/libvirt/test_imagebackend.py +++ b/nova/tests/unit/virt/libvirt/test_imagebackend.py @@ -524,13 +524,15 @@ def test_cache_template_exists(self, mock_exists): mock_exists.assert_has_calls(exist_calls) + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch.object(imagebackend.utils, 
'synchronized') @mock.patch('nova.virt.libvirt.utils.create_image') @mock.patch.object(os.path, 'exists', side_effect=[]) @mock.patch.object(imagebackend.Image, 'verify_base_size') @mock.patch('nova.privsep.path.utime') def test_create_image( - self, mock_utime, mock_verify, mock_exist, mock_create, mock_sync + self, mock_utime, mock_verify, mock_exist, mock_create, mock_sync, + mock_detect_format ): mock_sync.side_effect = lambda *a, **kw: self._fake_deco fn = mock.MagicMock() @@ -551,7 +553,10 @@ def test_create_image( mock_exist.assert_has_calls(exist_calls) self.assertTrue(mock_sync.called) mock_utime.assert_called() + mock_detect_format.assert_called_once() + mock_detect_format.return_value.safety_check.assert_called_once_with() + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch.object(imagebackend.utils, 'synchronized') @mock.patch('nova.virt.libvirt.utils.create_image') @mock.patch.object(imagebackend.disk, 'extend') @@ -559,7 +564,8 @@ def test_create_image( @mock.patch.object(imagebackend.Qcow2, 'get_disk_size') @mock.patch('nova.privsep.path.utime') def test_create_image_too_small(self, mock_utime, mock_get, mock_exist, - mock_extend, mock_create, mock_sync): + mock_extend, mock_create, mock_sync, + mock_detect_format): mock_sync.side_effect = lambda *a, **kw: self._fake_deco mock_get.return_value = self.SIZE fn = mock.MagicMock() @@ -576,7 +582,9 @@ def test_create_image_too_small(self, mock_utime, mock_get, mock_exist, self.assertTrue(mock_sync.called) self.assertFalse(mock_create.called) self.assertFalse(mock_extend.called) + mock_detect_format.assert_called_once() + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch.object(imagebackend.utils, 'synchronized') @mock.patch('nova.virt.libvirt.utils.create_image') @mock.patch('nova.virt.libvirt.utils.get_disk_backing_file') @@ -588,7 +596,8 @@ def test_create_image_too_small(self, mock_utime, mock_get, mock_exist, def test_generate_resized_backing_files(self, 
mock_utime, mock_copy, mock_verify, mock_exist, mock_extend, mock_get, - mock_create, mock_sync): + mock_create, mock_sync, + mock_detect_format): mock_sync.side_effect = lambda *a, **kw: self._fake_deco mock_get.return_value = self.QCOW2_BASE fn = mock.MagicMock() @@ -615,7 +624,9 @@ def test_generate_resized_backing_files(self, mock_utime, mock_copy, self.assertTrue(mock_sync.called) self.assertFalse(mock_create.called) mock_utime.assert_called() + mock_detect_format.assert_called_once() + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch.object(imagebackend.utils, 'synchronized') @mock.patch('nova.virt.libvirt.utils.create_image') @mock.patch('nova.virt.libvirt.utils.get_disk_backing_file') @@ -626,7 +637,8 @@ def test_generate_resized_backing_files(self, mock_utime, mock_copy, def test_qcow2_exists_and_has_no_backing_file(self, mock_utime, mock_verify, mock_exist, mock_extend, mock_get, - mock_create, mock_sync): + mock_create, mock_sync, + mock_detect_format): mock_sync.side_effect = lambda *a, **kw: self._fake_deco mock_get.return_value = None fn = mock.MagicMock() @@ -647,6 +659,31 @@ def test_qcow2_exists_and_has_no_backing_file(self, mock_utime, self.assertTrue(mock_sync.called) self.assertFalse(mock_create.called) self.assertFalse(mock_extend.called) + mock_detect_format.assert_called_once() + + @mock.patch('nova.image.format_inspector.detect_file_format') + @mock.patch.object(imagebackend.utils, 'synchronized') + @mock.patch('nova.virt.libvirt.utils.create_image') + @mock.patch('nova.virt.libvirt.utils.get_disk_backing_file') + @mock.patch.object(imagebackend.disk, 'extend') + @mock.patch.object(os.path, 'exists', side_effect=[]) + @mock.patch.object(imagebackend.Image, 'verify_base_size') + def test_qcow2_exists_and_fails_safety_check(self, + mock_verify, mock_exist, + mock_extend, mock_get, + mock_create, mock_sync, + mock_detect_format): + mock_detect_format.return_value.safety_check.return_value = False + mock_sync.side_effect 
= lambda *a, **kw: self._fake_deco + mock_get.return_value = None + fn = mock.MagicMock() + mock_exist.side_effect = [False, True, False, True, True] + image = self.image_class(self.INSTANCE, self.NAME) + + self.assertRaises(exception.InvalidDiskInfo, + image.create_image, fn, self.TEMPLATE_PATH, + self.SIZE) + mock_verify.assert_not_called() def test_resolve_driver_format(self): image = self.image_class(self.INSTANCE, self.NAME) diff --git a/nova/tests/unit/virt/libvirt/test_utils.py b/nova/tests/unit/virt/libvirt/test_utils.py index 68e24b7b6fc..0ab00717d28 100644 --- a/nova/tests/unit/virt/libvirt/test_utils.py +++ b/nova/tests/unit/virt/libvirt/test_utils.py @@ -106,15 +106,27 @@ def test_valid_hostname_bad(self): @mock.patch('oslo_concurrency.processutils.execute') @mock.patch('nova.virt.images.qemu_img_info') + @mock.patch('nova.image.format_inspector.detect_file_format') def _test_create_image( - self, path, disk_format, disk_size, mock_info, mock_execute, - backing_file=None + self, path, disk_format, disk_size, mock_detect, mock_info, + mock_execute, backing_file=None, safety_check=True ): + if isinstance(backing_file, dict): + backing_info = backing_file + backing_file = backing_info.pop('file', None) + else: + backing_info = {} + backing_backing_file = backing_info.pop('backing_file', None) + mock_info.return_value = mock.Mock( file_format=mock.sentinel.backing_fmt, cluster_size=mock.sentinel.cluster_size, + backing_file=backing_backing_file, + format_specific=backing_info, ) + mock_detect.return_value.safety_check.return_value = safety_check + libvirt_utils.create_image( path, disk_format, disk_size, backing_file=backing_file) @@ -126,7 +138,7 @@ def _test_create_image( mock_info.assert_called_once_with(backing_file) cow_opts = [ '-o', - f'backing_file={mock.sentinel.backing_file},' + f'backing_file={backing_file},' f'backing_fmt={mock.sentinel.backing_fmt},' f'cluster_size={mock.sentinel.cluster_size}', ] @@ -139,6 +151,8 @@ def _test_create_image( 
expected_args += (disk_size,) self.assertEqual([(expected_args,)], mock_execute.call_args_list) + if backing_file: + mock_detect.return_value.safety_check.assert_called_once_with() def test_create_image_raw(self): self._test_create_image('/some/path', 'raw', '10G') @@ -154,6 +168,25 @@ def test_create_image_backing_file(self): backing_file=mock.sentinel.backing_file, ) + def test_create_image_base_has_backing_file(self): + self.assertRaises( + exception.InvalidDiskInfo, + self._test_create_image, + '/some/stuff', 'qcow2', '1234567891234', + backing_file={'file': mock.sentinel.backing_file, + 'backing_file': mock.sentinel.backing_backing_file}, + ) + + def test_create_image_base_has_data_file(self): + self.assertRaises( + exception.InvalidDiskInfo, + self._test_create_image, + '/some/stuff', 'qcow2', '1234567891234', + backing_file={'file': mock.sentinel.backing_file, + 'backing_file': mock.sentinel.backing_backing_file, + 'data': {'data-file': mock.sentinel.data_file}}, + ) + def test_create_image_size_none(self): self._test_create_image( '/some/stuff', 'qcow2', None, diff --git a/nova/virt/libvirt/imagebackend.py b/nova/virt/libvirt/imagebackend.py index 534cc60759b..e9e62095680 100644 --- a/nova/virt/libvirt/imagebackend.py +++ b/nova/virt/libvirt/imagebackend.py @@ -34,6 +34,7 @@ import nova.conf from nova import exception from nova.i18n import _ +from nova.image import format_inspector from nova.image import glance import nova.privsep.libvirt import nova.privsep.path @@ -660,6 +661,20 @@ def create_qcow2_image(base, target, size): if not os.path.exists(base): prepare_template(target=base, *args, **kwargs) + # NOTE(danms): We need to perform safety checks on the base image + # before we inspect it for other attributes. We do this each time + # because additional safety checks could have been added since we + # downloaded the image. 
+ if not CONF.workarounds.disable_deep_image_inspection: + inspector = format_inspector.detect_file_format(base) + if not inspector.safety_check(): + LOG.warning('Base image %s failed safety check', base) + # NOTE(danms): This is the same exception as would be raised + # by qemu_img_info() if the disk format was unreadable or + # otherwise unsuitable. + raise exception.InvalidDiskInfo( + reason=_('Base image failed safety check')) + # NOTE(ankit): Update the mtime of the base file so the image # cache manager knows it is in use. _update_utime_ignore_eacces(base) diff --git a/nova/virt/libvirt/utils.py b/nova/virt/libvirt/utils.py index 0675e4ac146..7637930b6a6 100644 --- a/nova/virt/libvirt/utils.py +++ b/nova/virt/libvirt/utils.py @@ -34,6 +34,7 @@ from nova import context as nova_context from nova import exception from nova.i18n import _ +from nova.image import format_inspector from nova import objects from nova.objects import fields as obj_fields import nova.privsep.fs @@ -132,7 +133,34 @@ def create_image( cow_opts = [] if backing_file: + # NOTE(danms): We need to perform safety checks on the base image + # before we inspect it for other attributes. We do this each time + # because additional safety checks could have been added since we + # downloaded the image. + if not CONF.workarounds.disable_deep_image_inspection: + inspector = format_inspector.detect_file_format(backing_file) + if not inspector.safety_check(): + LOG.warning('Base image %s failed safety check', backing_file) + # NOTE(danms): This is the same exception as would be raised + # by qemu_img_info() if the disk format was unreadable or + # otherwise unsuitable. 
+ raise exception.InvalidDiskInfo( + reason=_('Base image failed safety check')) + base_details = images.qemu_img_info(backing_file) + if base_details.backing_file is not None: + LOG.warning('Base image %s failed safety check', backing_file) + raise exception.InvalidDiskInfo( + reason=_('Base image failed safety check')) + try: + data_file = base_details.format_specific['data']['data-file'] + except (KeyError, TypeError, AttributeError): + data_file = None + if data_file is not None: + LOG.warning('Base image %s failed safety check', backing_file) + raise exception.InvalidDiskInfo( + reason=_('Base image failed safety check')) + cow_opts += [ f'backing_file={backing_file}', f'backing_fmt={base_details.file_format}' From a2acb31d790e6cb41c067bfc0343bde274c9428c Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Mon, 1 Jul 2024 09:06:40 -0700 Subject: [PATCH 49/73] Fix vmdk_allowed_types checking This restores the vmdk_allowed_types checking in create_image() that was unintentionally lost by tightening the qemu-type-matches-glance code in the fetch patch recently. Since we are still detecting the format of base images without metadata, we would have treated a vmdk file that claims to be raw as raw in fetch, but then read it like a vmdk once it was used as a base image for something else. Conflicts: nova/tests/unit/virt/libvirt/test_utils.py NOTE(elod.illes): conflicts are due to encryption support adding patch I5d6d2a7b03b5ace0826af80c4004de852579ff12 was introduced in zed. 
Change-Id: I07b332a7edb814f6a91661651d9d24bfd6651ae7 Related-Bug: #2059809 (cherry picked from commit 08be7b2a0dc1d7728d8034bc2aab0428c4fb642e) (cherry picked from commit 11301e7e3f0d81a3368632f90608e30d9c647111) (cherry picked from commit 70a435fd519a0ebcc3ac9ad5254fefbf19c93e48) (cherry picked from commit f732f8476851e6272d8ad9937f54b918795844e8) --- nova/tests/unit/virt/libvirt/test_utils.py | 25 ++++++++++++++++++++-- nova/virt/libvirt/utils.py | 2 ++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_utils.py b/nova/tests/unit/virt/libvirt/test_utils.py index 0ab00717d28..26de82790d9 100644 --- a/nova/tests/unit/virt/libvirt/test_utils.py +++ b/nova/tests/unit/virt/libvirt/test_utils.py @@ -117,9 +117,11 @@ def _test_create_image( else: backing_info = {} backing_backing_file = backing_info.pop('backing_file', None) + backing_fmt = backing_info.pop('backing_fmt', + mock.sentinel.backing_fmt) mock_info.return_value = mock.Mock( - file_format=mock.sentinel.backing_fmt, + file_format=backing_fmt, cluster_size=mock.sentinel.cluster_size, backing_file=backing_backing_file, format_specific=backing_info, @@ -139,7 +141,7 @@ def _test_create_image( cow_opts = [ '-o', f'backing_file={backing_file},' - f'backing_fmt={mock.sentinel.backing_fmt},' + f'backing_fmt={backing_fmt},' f'cluster_size={mock.sentinel.cluster_size}', ] @@ -193,6 +195,25 @@ def test_create_image_size_none(self): backing_file=mock.sentinel.backing_file, ) + def test_create_image_vmdk(self): + self._test_create_image( + '/some/vmdk', 'vmdk', '1234567891234', + backing_file={'file': mock.sentinel.backing_file, + 'backing_fmt': 'vmdk', + 'backing_file': None, + 'data': {'create-type': 'monolithicSparse'}} + ) + + def test_create_image_vmdk_invalid_type(self): + self.assertRaises(exception.ImageUnacceptable, + self._test_create_image, + '/some/vmdk', 'vmdk', '1234567891234', + backing_file={'file': mock.sentinel.backing_file, + 'backing_fmt': 'vmdk', + 
'backing_file': None, + 'data': {'create-type': 'monolithicFlat'}} + ) + @ddt.unpack @ddt.data({'fs_type': 'some_fs_type', 'default_eph_format': None, diff --git a/nova/virt/libvirt/utils.py b/nova/virt/libvirt/utils.py index 7637930b6a6..6dcc928a28d 100644 --- a/nova/virt/libvirt/utils.py +++ b/nova/virt/libvirt/utils.py @@ -148,6 +148,8 @@ def create_image( reason=_('Base image failed safety check')) base_details = images.qemu_img_info(backing_file) + if base_details.file_format == 'vmdk': + images.check_vmdk_image('base', base_details) if base_details.backing_file is not None: LOG.warning('Base image %s failed safety check', backing_file) raise exception.InvalidDiskInfo( From 1be000939585270e9241b860a1d84e8cc209654a Mon Sep 17 00:00:00 2001 From: Elod Illes Date: Wed, 10 Jul 2024 15:51:26 +0200 Subject: [PATCH 50/73] [CI][zed-only] Use zed-last version of tempest in nova-next And exclude a flaky test case[1] as well to make the gate more stable. [1] test_instances_with_cinder_volumes_on_all_compute_nodes Change-Id: I3c69a20993bca066ece46eed16d4e897144524cd --- .zuul.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.zuul.yaml b/.zuul.yaml index 550332abc7d..a46c6fce662 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -339,6 +339,8 @@ post-run: playbooks/nova-next/post.yaml required-projects: - novnc/novnc + - name: openstack/tempest + override-checkout: zed-last vars: # We use the "all" environment for tempest_test_regex and # tempest_exclude_regex. @@ -352,7 +354,7 @@ # tempest_test_exclude_list. # FIXME(lyarwood): The tempest.api.compute.admin.test_volume_swap tests # are skipped until bug #1929710 is resolved. 
- tempest_exclude_regex: ^tempest\.(scenario\.test_network_(?!qos)|api\.compute\.admin\.test_volume_swap)|tempest.api.compute.servers.test_device_tagging.TaggedAttachmentsTest.test_tagged_attachment + tempest_exclude_regex: ^tempest\.(scenario\.test_network_(?!qos)|api\.compute\.admin\.test_volume_swap)|tempest.api.compute.servers.test_device_tagging.TaggedAttachmentsTest.test_tagged_attachment|test_instances_with_cinder_volumes_on_all_compute_nodes devstack_local_conf: post-config: $NOVA_CPU_CONF: From 3a7e58b06a2a89dbc4d4644c9c63bab28325ecab Mon Sep 17 00:00:00 2001 From: Elod Illes Date: Mon, 8 Jul 2024 10:51:39 +0200 Subject: [PATCH 51/73] [CI][zed-only] Ceph minimum client on cinder-plugin-ceph-tempest job enable Since e222cc976918a331bacff150e84069fda8f4960a, it is possible to set the minimum client version. The goal of this patch is to enable the *mimic* client version for the current cinder-plugin-ceph-tempest job. As a result, we will be able to ensure that snapshots can be deleted when a volume is cloned from them. So that we can reduce the excluded test cases while keeping the gate functional. Change-Id: I441d0513a6547b2fbae011b7e9dad7d6a51398a6 --- .zuul.yaml | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index a46c6fce662..15e44f81fe7 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -598,16 +598,9 @@ pre-run: - playbooks/ceph/glance-copy-policy.yaml vars: - # NOTE(elod.illes): this job started to break with the following five - # test cases, somewhere around merging cinder-tempest-plugin patch - # I281f881ad565e565839522ddf02057f7545c7146 so let's just exclude - # them to unblock the gate. 
- tempest_exclude_regex: "\ - (test_delete_dep_chain)|\ - (test_delete_dep_chain_2)|\ - (test_delete_source_snapshot)|\ - (test_delete_source_volume)|\ - (test_nova_image_snapshot_dependency)" + # NOTE(elod.illes): this job is breaking with the following test case on + # unmaintained/yoga, so let's just exclude it to unblock the gate + tempest_exclude_regex: test_nova_image_snapshot_dependency # NOTE(danms): These tests create an empty non-raw image, which nova # will refuse because we set never_download_image_if_on_rbd in this job. # Just skip these tests for this case. @@ -615,6 +608,7 @@ GLANCE_STANDALONE: True GLANCE_USE_IMPORT_WORKFLOW: True DEVSTACK_PARALLEL: True + CEPH_MIN_CLIENT_VERSION: "mimic" # NOTE(danms): This job is pretty heavy as it is, so we disable some # services that are not relevant to the nova-glance-ceph scenario # that this job is intended to validate. From d7e3d722cd6c59968cbfe1d7a3bd7021c90165e5 Mon Sep 17 00:00:00 2001 From: Sean Mooney Date: Thu, 4 Jul 2024 12:38:39 +0100 Subject: [PATCH 52/73] port format inspector tests from glance This commit is a direct port of the format inspector unit tests from glance as of commit 0d8e79b713bc31a78f0f4eac14ee594ca8520999 the only changes to the test are as follows "from glance.common import format_inspector" was updated to "from nova.image import format_inspector" "from glance.tests import utils as test_utils" was replaced with "from nova import test" "test_utils.BaseTestCase" was replaced with "test.NoDBTestCase" "glance-unittest-formatinspector-" was replaced with "nova-unittest-formatinspector-" This makes the test funtional in nova. TestFormatInspectors requries qemu-img to be installed on the host which would be a new depency for executing unit tests. to avoid that we skip TestFormatInspectors if qemu-img is not installed. TestFormatInspectorInfra and TestFormatInspectorsTargeted do not have a qemu-img dependency so no changes to the test assertions were required. 
Change-Id: Ia34203f246f0bc574e11476287dfb33fda7954fe (cherry picked from commit 838daa3cad5fb3cdd10fb7aa76c647330a66939e) (cherry picked from commit 66205be426028f8b7d16163ca6901bc181d703b6) (cherry picked from commit 497abea5a189cc7043766273e9d17571f722190a) (cherry picked from commit 58cd955c7d4848ed8da71f3c0352a5303cae6200) --- .../tests/unit/image/test_format_inspector.py | 517 ++++++++++++++++++ 1 file changed, 517 insertions(+) create mode 100644 nova/tests/unit/image/test_format_inspector.py diff --git a/nova/tests/unit/image/test_format_inspector.py b/nova/tests/unit/image/test_format_inspector.py new file mode 100644 index 00000000000..4bda796ea42 --- /dev/null +++ b/nova/tests/unit/image/test_format_inspector.py @@ -0,0 +1,517 @@ +# Copyright 2020 Red Hat, Inc +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import io +import os +import re +import struct +import subprocess +import tempfile +from unittest import mock + +from oslo_utils import units + +from nova.image import format_inspector +from nova import test + + +def get_size_from_qemu_img(filename): + output = subprocess.check_output('qemu-img info "%s"' % filename, + shell=True) + for line in output.split(b'\n'): + m = re.search(b'^virtual size: .* .([0-9]+) bytes', line.strip()) + if m: + return int(m.group(1)) + + raise Exception('Could not find virtual size with qemu-img') + + +class TestFormatInspectors(test.NoDBTestCase): + def setUp(self): + super(TestFormatInspectors, self).setUp() + # these tests depend on qemu-img being installed + # and in the path, if it is not installed, skip + try: + subprocess.check_output('qemu-img --version', shell=True) + except Exception: + self.skipTest('qemu-img not installed') + + self._created_files = [] + + def tearDown(self): + super(TestFormatInspectors, self).tearDown() + for fn in self._created_files: + try: + os.remove(fn) + except Exception: + pass + + def _create_img(self, fmt, size, subformat=None, options=None, + backing_file=None): + if fmt == 'vhd': + # QEMU calls the vhd format vpc + fmt = 'vpc' + + if options is None: + options = {} + opt = '' + prefix = 'nova-unittest-formatinspector-' + + if subformat: + options['subformat'] = subformat + prefix += subformat + '-' + + if options: + opt += '-o ' + ','.join('%s=%s' % (k, v) + for k, v in options.items()) + + if backing_file is not None: + opt += ' -b %s -F raw' % backing_file + + fn = tempfile.mktemp(prefix=prefix, + suffix='.%s' % fmt) + self._created_files.append(fn) + subprocess.check_output( + 'qemu-img create -f %s %s %s %i' % (fmt, opt, fn, size), + shell=True) + return fn + + def _create_allocated_vmdk(self, size_mb, subformat=None): + # We need a "big" VMDK file to exercise some parts of the code of the + # format_inspector. 
A way to create one is to first create an empty + # file, and then to convert it with the -S 0 option. + + if subformat is None: + # Matches qemu-img default, see `qemu-img convert -O vmdk -o help` + subformat = 'monolithicSparse' + + prefix = 'nova-unittest-formatinspector-%s-' % subformat + fn = tempfile.mktemp(prefix=prefix, suffix='.vmdk') + self._created_files.append(fn) + raw = tempfile.mktemp(prefix=prefix, suffix='.raw') + self._created_files.append(raw) + + # Create a file with pseudo-random data, otherwise it will get + # compressed in the streamOptimized format + subprocess.check_output( + 'dd if=/dev/urandom of=%s bs=1M count=%i' % (raw, size_mb), + shell=True) + + # Convert it to VMDK + subprocess.check_output( + 'qemu-img convert -f raw -O vmdk -o subformat=%s -S 0 %s %s' % ( + subformat, raw, fn), + shell=True) + return fn + + def _test_format_at_block_size(self, format_name, img, block_size): + fmt = format_inspector.get_inspector(format_name)() + self.assertIsNotNone(fmt, + 'Did not get format inspector for %s' % ( + format_name)) + wrapper = format_inspector.InfoWrapper(open(img, 'rb'), fmt) + + while True: + chunk = wrapper.read(block_size) + if not chunk: + break + + wrapper.close() + return fmt + + def _test_format_at_image_size(self, format_name, image_size, + subformat=None): + img = self._create_img(format_name, image_size, subformat=subformat) + + # Some formats have internal alignment restrictions making this not + # always exactly like image_size, so get the real value for comparison + virtual_size = get_size_from_qemu_img(img) + + # Read the format in various sizes, some of which will read whole + # sections in a single read, others will be completely unaligned, etc. 
+ for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi): + fmt = self._test_format_at_block_size(format_name, img, block_size) + self.assertTrue(fmt.format_match, + 'Failed to match %s at size %i block %i' % ( + format_name, image_size, block_size)) + self.assertEqual(virtual_size, fmt.virtual_size, + ('Failed to calculate size for %s at size %i ' + 'block %i') % (format_name, image_size, + block_size)) + memory = sum(fmt.context_info.values()) + self.assertLess(memory, 512 * units.Ki, + 'Format used more than 512KiB of memory: %s' % ( + fmt.context_info)) + + def _test_format(self, format_name, subformat=None): + # Try a few different image sizes, including some odd and very small + # sizes + for image_size in (512, 513, 2057, 7): + self._test_format_at_image_size(format_name, image_size * units.Mi, + subformat=subformat) + + def test_qcow2(self): + self._test_format('qcow2') + + def test_vhd(self): + self._test_format('vhd') + + def test_vhdx(self): + self._test_format('vhdx') + + def test_vmdk(self): + self._test_format('vmdk') + + def test_vmdk_stream_optimized(self): + self._test_format('vmdk', 'streamOptimized') + + def test_from_file_reads_minimum(self): + img = self._create_img('qcow2', 10 * units.Mi) + file_size = os.stat(img).st_size + fmt = format_inspector.QcowInspector.from_file(img) + # We know everything we need from the first 512 bytes of a QCOW image, + # so make sure that we did not read the whole thing when we inspect + # a local file. 
+ self.assertLess(fmt.actual_size, file_size) + + def test_qed_always_unsafe(self): + img = self._create_img('qed', 10 * units.Mi) + fmt = format_inspector.get_inspector('qed').from_file(img) + self.assertTrue(fmt.format_match) + self.assertFalse(fmt.safety_check()) + + def _test_vmdk_bad_descriptor_offset(self, subformat=None): + format_name = 'vmdk' + image_size = 10 * units.Mi + descriptorOffsetAddr = 0x1c + BAD_ADDRESS = 0x400 + img = self._create_img(format_name, image_size, subformat=subformat) + + # Corrupt the header + fd = open(img, 'r+b') + fd.seek(descriptorOffsetAddr) + fd.write(struct.pack(' Date: Thu, 4 Jul 2024 13:55:41 +0100 Subject: [PATCH 53/73] Reproduce iso regression with deep format inspection This change adds a reproducer for the regression in iso file support when workarounds.disable_deep_image_inspection = False Change-Id: I56d8b9980b4871941ba5de91e60a7df6a40106a8 (cherry picked from commit b5a1d3b4b2d0aaa351479b1d7e41a3895c28fab0) (cherry picked from commit 3a6d9a038fad2bd58bdf4fb87af04158301a6929) (cherry picked from commit 000b435a44e905122a45d3b137a576c60bf42a58) (cherry picked from commit 1233d7b935c018e79728c5691216fa2569affe08) --- .../tests/unit/image/test_format_inspector.py | 72 ++++++++++++++++--- 1 file changed, 63 insertions(+), 9 deletions(-) diff --git a/nova/tests/unit/image/test_format_inspector.py b/nova/tests/unit/image/test_format_inspector.py index 4bda796ea42..9bd99c03cca 100644 --- a/nova/tests/unit/image/test_format_inspector.py +++ b/nova/tests/unit/image/test_format_inspector.py @@ -27,6 +27,9 @@ from nova import test +TEST_IMAGE_PREFIX = 'nova-unittest-formatinspector-' + + def get_size_from_qemu_img(filename): output = subprocess.check_output('qemu-img info "%s"' % filename, shell=True) @@ -41,13 +44,6 @@ def get_size_from_qemu_img(filename): class TestFormatInspectors(test.NoDBTestCase): def setUp(self): super(TestFormatInspectors, self).setUp() - # these tests depend on qemu-img being installed - # and in the 
path, if it is not installed, skip - try: - subprocess.check_output('qemu-img --version', shell=True) - except Exception: - self.skipTest('qemu-img not installed') - self._created_files = [] def tearDown(self): @@ -58,8 +54,55 @@ def tearDown(self): except Exception: pass + def _create_iso(self, image_size, subformat='iso-9660'): + # these tests depend on mkisofs + # being installed and in the path, + # if it is not installed, skip + try: + subprocess.check_output('mkisofs --version', shell=True) + except Exception: + self.skipTest('mkisofs not installed') + + size = image_size // units.Mi + base_cmd = "mkisofs" + if subformat == 'udf': + # depending on the distribution mkisofs may not support udf + # and may be provided by genisoimage instead. As a result we + # need to check if the command supports udf via help + # instead of checking the installed version. + # mkisofs --help outputs to stderr so we need to + # redirect it to stdout to use grep. + try: + subprocess.check_output( + 'mkisofs --help 2>&1 | grep udf', shell=True) + except Exception: + self.skipTest('mkisofs does not support udf format') + base_cmd += " -udf" + prefix = TEST_IMAGE_PREFIX + prefix += '-%s-' % subformat + fn = tempfile.mktemp(prefix=prefix, suffix='.iso') + self._created_files.append(fn) + subprocess.check_output( + 'dd if=/dev/zero of=%s bs=1M count=%i' % (fn, size), + shell=True) + subprocess.check_output( + '%s -o %s -V "TEST" -J -r %s' % (base_cmd, fn, fn), + shell=True) + return fn + def _create_img(self, fmt, size, subformat=None, options=None, backing_file=None): + if fmt == 'iso': + return self._create_iso(size, subformat) + + # these tests depend on qemu-img + # being installed and in the path, + # if it is not installed, skip + try: + subprocess.check_output('qemu-img --version', shell=True) + except Exception: + self.skipTest('qemu-img not installed') + if fmt == 'vhd': # QEMU calls the vhd format vpc fmt = 'vpc' @@ -67,7 +110,7 @@ def _create_img(self, fmt, size, 
subformat=None, options=None, if options is None: options = {} opt = '' - prefix = 'nova-unittest-formatinspector-' + prefix = TEST_IMAGE_PREFIX if subformat: options['subformat'] = subformat @@ -97,7 +140,8 @@ def _create_allocated_vmdk(self, size_mb, subformat=None): # Matches qemu-img default, see `qemu-img convert -O vmdk -o help` subformat = 'monolithicSparse' - prefix = 'nova-unittest-formatinspector-%s-' % subformat + prefix = TEST_IMAGE_PREFIX + prefix += '-%s-' % subformat fn = tempfile.mktemp(prefix=prefix, suffix='.vmdk') self._created_files.append(fn) raw = tempfile.mktemp(prefix=prefix, suffix='.raw') @@ -165,6 +209,16 @@ def _test_format(self, format_name, subformat=None): def test_qcow2(self): self._test_format('qcow2') + def test_iso_9660(self): + # reproduce iso-9660 format regression + self.assertRaises( + TypeError, self._test_format, 'iso', subformat='iso-9660') + + def test_udf(self): + # reproduce udf format regression + self.assertRaises( + TypeError, self._test_format, 'iso', subformat='udf') + def test_vhd(self): self._test_format('vhd') From e8f00617ed319aa37f6946cf10883eef6d180612 Mon Sep 17 00:00:00 2001 From: Sean Mooney Date: Thu, 4 Jul 2024 20:09:31 +0100 Subject: [PATCH 54/73] Add iso file format inspector This change includes unit tests for the ISO format inspector using mkisofs to generate the iso files. A test for stashing qcow content in the system_area of an iso file is also included. This change modifies format_inspector.detect_file_format to evaluate all inspectors until they are complete and raise an InvalidDiskInfo exception if multiple formats match. 
Related-Bug: #2059809 Change-Id: I7e12718fb3e1f77eb8d1cfcb9fa64e8ddeb9e712 (cherry picked from commit b1cc39848ebe9b9cb63141a647bda52a2842ee4b) (cherry picked from commit eeda7c333c773216c216159926673874ce4843ba) (cherry picked from commit 24628ecbbe9d5fdd4fe6767ca92395f0d3da9e48) (cherry picked from commit 65f0789df05e2ba7f11c0eaf2c6959367acbced2) --- nova/image/format_inspector.py | 109 +++++++++++++++++- .../tests/unit/image/test_format_inspector.py | 106 ++++++++++++++--- nova/tests/unit/virt/test_images.py | 28 +++++ nova/virt/images.py | 5 + 4 files changed, 230 insertions(+), 18 deletions(-) diff --git a/nova/image/format_inspector.py b/nova/image/format_inspector.py index 8e57d7ed2c4..49cb75930a9 100644 --- a/nova/image/format_inspector.py +++ b/nova/image/format_inspector.py @@ -24,6 +24,7 @@ import struct from oslo_log import log as logging +from oslo_utils import units LOG = logging.getLogger(__name__) @@ -843,6 +844,93 @@ def __str__(self): return 'vdi' +class ISOInspector(FileInspector): + """ISO 9660 and UDF format + + we need to check the first 32KB + descriptor size + to look for the ISO 9660 or UDF signature. + + http://wiki.osdev.org/ISO_9660 + http://wiki.osdev.org/UDF + mkisofs --help | grep udf + + The Universal Disc Format or UDF is the filesystem used on DVDs and + Blu-Ray discs.UDF is an extension of ISO 9660 and shares the same + header structure and initial layout. + + Like the CDFS(ISO 9660) file system, + the UDF file system uses a 2048 byte sector size, + and it designates that the first 16 sectors can be used by the OS + to store proprietary data or boot logic. + + That means we need to check the first 32KB + descriptor size + to look for the ISO 9660 or UDF signature. + both formats have an extent based layout, so we can't determine + ahead of time where the descriptor will be located. + + fortunately, the ISO 9660 and UDF formats have a Primary Volume Descriptor + located at the beginning of the image, which contains the volume size. 
+ + """ + + def __init__(self, *a, **k): + super(ISOInspector, self).__init__(*a, **k) + self.new_region('system_area', CaptureRegion(0, 32 * units.Ki)) + self.new_region('header', CaptureRegion(32 * units.Ki, 2 * units.Ki)) + + @property + def format_match(self): + if not self.complete: + return False + signature = self.region('header').data[1:6] + assert len(signature) == 5 + return signature in (b'CD001', b'NSR02', b'NSR03') + + @property + def virtual_size(self): + if not self.complete: + return 0 + if not self.format_match: + return 0 + + # the header size is 2KB or 1 sector + # the first header field is the descriptor type which is 1 byte + # the second field is the standard identifier which is 5 bytes + # the third field is the version which is 1 byte + # the rest of the header contains type specific data is 2041 bytes + # see http://wiki.osdev.org/ISO_9660#The_Primary_Volume_Descriptor + + # we need to check that the descriptor type is 1 + # to ensure that this is a primary volume descriptor + descriptor_type = self.region('header').data[0] + if descriptor_type != 1: + return 0 + # The size in bytes of a logical block is stored at offset 128 + # and is 2 bytes long encoded in both little and big endian + # int16_LSB-MSB so the field is 4 bytes long + logical_block_size_data = self.region('header').data[128:132] + assert len(logical_block_size_data) == 4 + # given the encoding we only need to read half the field so we + # can use the first 2 bytes which are the little endian part + # this is normally 2048 or 2KB but we need to check as it can be + # different according to the ISO 9660 standard. 
+ logical_block_size, = struct.unpack(' 1: + all_formats = [str(inspector) for inspector in detections] + raise ImageFormatError( + 'Multiple formats detected: %s' % ', '.join(all_formats)) + + return inspectors['raw'] if not detections else detections[0] diff --git a/nova/tests/unit/image/test_format_inspector.py b/nova/tests/unit/image/test_format_inspector.py index 9bd99c03cca..40012503207 100644 --- a/nova/tests/unit/image/test_format_inspector.py +++ b/nova/tests/unit/image/test_format_inspector.py @@ -54,7 +54,13 @@ def tearDown(self): except Exception: pass - def _create_iso(self, image_size, subformat='iso-9660'): + def _create_iso(self, image_size, subformat='9660'): + """Create an ISO file of the given size. + + :param image_size: The size of the image to create in bytes + :param subformat: The subformat to use, if any + """ + # these tests depend on mkisofs # being installed and in the path, # if it is not installed, skip @@ -86,12 +92,22 @@ def _create_iso(self, image_size, subformat='iso-9660'): 'dd if=/dev/zero of=%s bs=1M count=%i' % (fn, size), shell=True) subprocess.check_output( - '%s -o %s -V "TEST" -J -r %s' % (base_cmd, fn, fn), + '%s -V "TEST" -o %s %s' % (base_cmd, fn, fn), shell=True) return fn - def _create_img(self, fmt, size, subformat=None, options=None, - backing_file=None): + def _create_img( + self, fmt, size, subformat=None, options=None, + backing_file=None): + """Create an image file of the given format and size. 
+ + :param fmt: The format to create + :param size: The size of the image to create in bytes + :param subformat: The subformat to use, if any + :param options: A dictionary of options to pass to the format + :param backing_file: The backing file to use, if any + """ + if fmt == 'iso': return self._create_iso(size, subformat) @@ -177,6 +193,13 @@ def _test_format_at_block_size(self, format_name, img, block_size): def _test_format_at_image_size(self, format_name, image_size, subformat=None): + """Test the format inspector for the given format at the + given image size. + + :param format_name: The format to test + :param image_size: The size of the image to create in bytes + :param subformat: The subformat to use, if any + """ img = self._create_img(format_name, image_size, subformat=subformat) # Some formats have internal alignment restrictions making this not @@ -185,7 +208,15 @@ def _test_format_at_image_size(self, format_name, image_size, # Read the format in various sizes, some of which will read whole # sections in a single read, others will be completely unaligned, etc. - for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi): + block_sizes = [64 * units.Ki, 1 * units.Mi] + # ISO images have a 32KB system area at the beginning of the image + # as a result reading that in 17 or 512 byte blocks takes too long, + # causing the test to fail. The 64KiB block size is enough to read + # the system area and header in a single read. the 1MiB block size + # adds very little time to the test so we include it. 
+ if format_name != 'iso': + block_sizes.extend([17, 512]) + for block_size in block_sizes: fmt = self._test_format_at_block_size(format_name, img, block_size) self.assertTrue(fmt.format_match, 'Failed to match %s at size %i block %i' % ( @@ -210,14 +241,63 @@ def test_qcow2(self): self._test_format('qcow2') def test_iso_9660(self): - # reproduce iso-9660 format regression - self.assertRaises( - TypeError, self._test_format, 'iso', subformat='iso-9660') - - def test_udf(self): - # reproduce udf format regression - self.assertRaises( - TypeError, self._test_format, 'iso', subformat='udf') + self._test_format('iso', subformat='9660') + + def test_iso_udf(self): + self._test_format('iso', subformat='udf') + + def _generate_bad_iso(self): + # we want to emulate a malicious user who uploads a an + # ISO file has a qcow2 header in the system area + # of the ISO file + # we will create a qcow2 image and an ISO file + # and then copy the qcow2 header to the ISO file + # e.g. + # mkisofs -o orig.iso /etc/resolv.conf + # qemu-img create orig.qcow2 -f qcow2 64M + # dd if=orig.qcow2 of=outcome bs=32K count=1 + # dd if=orig.iso of=outcome bs=32K skip=1 seek=1 + + qcow = self._create_img('qcow2', 10 * units.Mi) + iso = self._create_iso(64 * units.Mi, subformat='9660') + # first ensure the files are valid + iso_fmt = self._test_format_at_block_size('iso', iso, 4 * units.Ki) + self.assertTrue(iso_fmt.format_match) + qcow_fmt = self._test_format_at_block_size('qcow2', qcow, 4 * units.Ki) + self.assertTrue(qcow_fmt.format_match) + # now copy the qcow2 header to an ISO file + prefix = TEST_IMAGE_PREFIX + prefix += '-bad-' + fn = tempfile.mktemp(prefix=prefix, suffix='.iso') + self._created_files.append(fn) + subprocess.check_output( + 'dd if=%s of=%s bs=32K count=1' % (qcow, fn), + shell=True) + subprocess.check_output( + 'dd if=%s of=%s bs=32K skip=1 seek=1' % (iso, fn), + shell=True) + return qcow, iso, fn + + def test_bad_iso_qcow2(self): + + _, _, fn = self._generate_bad_iso() + 
+ iso_check = self._test_format_at_block_size('iso', fn, 4 * units.Ki) + qcow_check = self._test_format_at_block_size('qcow2', fn, 4 * units.Ki) + # this system area of the ISO file is not considered part of the format + # the qcow2 header is in the system area of the ISO file + # so the ISO file is still valid + self.assertTrue(iso_check.format_match) + # the qcow2 header is in the system area of the ISO file + # but that will be parsed by the qcow2 format inspector + # and it will match + self.assertTrue(qcow_check.format_match) + # if we call format_inspector.detect_file_format it should detect + # and raise an exception because both match internally. + e = self.assertRaises( + format_inspector.ImageFormatError, + format_inspector.detect_file_format, fn) + self.assertIn('Multiple formats detected', str(e)) def test_vhd(self): self._test_format('vhd') diff --git a/nova/tests/unit/virt/test_images.py b/nova/tests/unit/virt/test_images.py index 46c9f9a8b5d..cc285dc4fec 100644 --- a/nova/tests/unit/virt/test_images.py +++ b/nova/tests/unit/virt/test_images.py @@ -235,6 +235,34 @@ def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch, mock_gi, images.fetch_to_raw, None, 'foo', 'anypath') self.assertIn('Invalid VMDK create-type specified', str(e)) + @mock.patch('os.rename') + @mock.patch.object(images, 'IMAGE_API') + @mock.patch('nova.image.format_inspector.get_inspector') + @mock.patch.object(images, 'fetch') + @mock.patch('nova.privsep.qemu.unprivileged_qemu_img_info') + def test_fetch_iso_is_raw(self, mock_info, mock_fetch, mock_gi, + mock_glance, mock_rename): + mock_glance.get.return_value = {'disk_format': 'iso'} + inspector = mock_gi.return_value.from_file.return_value + inspector.safety_check.return_value = True + # qemu-img does not have a parser for iso so it is treated as raw + info = { + "virtual-size": 356352, + "filename": "foo.iso", + "format": "raw", + "actual-size": 356352, + "dirty-flag": False + } + mock_info.return_value = 
jsonutils.dumps(info) + with mock.patch('os.path.exists', return_value=True): + images.fetch_to_raw(None, 'foo', 'anypath') + # Make sure we called info with -f raw for an iso, since qemu-img does + # not support iso + mock_info.assert_called_once_with('anypath.part', format='raw') + # Make sure that since we considered this to be a raw file, we did the + # just-rename-don't-convert path + mock_rename.assert_called_once_with('anypath.part', 'anypath') + @mock.patch.object(images, 'IMAGE_API') @mock.patch('nova.image.format_inspector.get_inspector') @mock.patch.object(images, 'qemu_img_info') diff --git a/nova/virt/images.py b/nova/virt/images.py index 5ec0dc0b6ba..813696ed7d7 100644 --- a/nova/virt/images.py +++ b/nova/virt/images.py @@ -171,6 +171,11 @@ def do_image_deep_inspection(img, image_href, path): raise exception.ImageUnacceptable( image_id=image_href, reason=_('Image not in a supported format')) + + if disk_format == 'iso': + # ISO image passed safety check; qemu will treat this as raw from here + disk_format = 'raw' + return disk_format From dae4230fcc1c5539ecab52eb5f7755cc844420cd Mon Sep 17 00:00:00 2001 From: Sean Mooney Date: Tue, 9 Jul 2024 15:09:09 +0100 Subject: [PATCH 55/73] fix qemu-img version dependent tests while backporting Ia34203f246f0bc574e11476287dfb33fda7954fe We observed that several of the tests showed distro specific behavior depending on if qemu was installed in the test env, what version is installed and how it was compiled This change ensures that if qemu is present that it supprot the required formats otherwise it skips the test. 
Change-Id: I131996cdd7aaf1f52d4caac33b153753ff6db869 (cherry picked from commit cc2514d02e0b0ebaf60a46d02732f7f8facc3191) (cherry picked from commit ae10fde55b113bc0a34bc69ff63bab809bc98ef3) (cherry picked from commit bb2645e92c98da0e02d650dab5ab90cafcbb824b) (cherry picked from commit 673103fd63a516dad3f6da14b95d34f9dd605c21) --- .../tests/unit/image/test_format_inspector.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/nova/tests/unit/image/test_format_inspector.py b/nova/tests/unit/image/test_format_inspector.py index 40012503207..a8e688b8eb3 100644 --- a/nova/tests/unit/image/test_format_inspector.py +++ b/nova/tests/unit/image/test_format_inspector.py @@ -111,18 +111,22 @@ def _create_img( if fmt == 'iso': return self._create_iso(size, subformat) - # these tests depend on qemu-img - # being installed and in the path, - # if it is not installed, skip - try: - subprocess.check_output('qemu-img --version', shell=True) - except Exception: - self.skipTest('qemu-img not installed') - if fmt == 'vhd': # QEMU calls the vhd format vpc fmt = 'vpc' + # these tests depend on qemu-img being installed and in the path, + # if it is not installed, skip. we also need to ensure that the + # format is supported by qemu-img, this can vary depending on the + # distribution so we need to check if the format is supported via + # the help output. + try: + subprocess.check_output( + 'qemu-img --help | grep %s' % fmt, shell=True) + except Exception: + self.skipTest( + 'qemu-img not installed or does not support %s format' % fmt) + if options is None: options = {} opt = '' From 11613e7b3244958fa8d0b5253a185287d1ade2d8 Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Thu, 11 Jul 2024 07:29:40 +0200 Subject: [PATCH 56/73] Stabilize iso format unit tests Some version of mkisofs does not properly handle if both the input and the output file of the command are the same. 
So this commit changes the unit tests depending on that binary to use a different files. Related-Bug: #2059809 Change-Id: I6924eb23ff5804c22a48ec6fabcec25f061906bb (cherry picked from commit c6d8c6972d52845774b36acb84cd08a4b2e4dcde) (cherry picked from commit a8783a767551df3dd943bd862cdba35c51cdb7a6) (cherry picked from commit 02147b36d35e1e462e1405c36a2e67a33de806de) (cherry picked from commit 47428f6caf503b94583dac614b59971f60a0ba9c) --- nova/tests/unit/image/test_format_inspector.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/nova/tests/unit/image/test_format_inspector.py b/nova/tests/unit/image/test_format_inspector.py index a8e688b8eb3..8406dfca378 100644 --- a/nova/tests/unit/image/test_format_inspector.py +++ b/nova/tests/unit/image/test_format_inspector.py @@ -91,10 +91,15 @@ def _create_iso(self, image_size, subformat='9660'): subprocess.check_output( 'dd if=/dev/zero of=%s bs=1M count=%i' % (fn, size), shell=True) + # We need to use different file as input and output as the behavior + # of mkisofs is version dependent if both the input and the output + # are the same and can cause test failures + out_fn = "%s.iso" % fn subprocess.check_output( - '%s -V "TEST" -o %s %s' % (base_cmd, fn, fn), + '%s -V "TEST" -o %s %s' % (base_cmd, out_fn, fn), shell=True) - return fn + self._created_files.append(out_fn) + return out_fn def _create_img( self, fmt, size, subformat=None, options=None, From 5ba1bd1185e09ecc2f77fe7427f9647dce0bdaea Mon Sep 17 00:00:00 2001 From: Stephen Finucane Date: Thu, 15 Dec 2022 00:09:04 +0000 Subject: [PATCH 57/73] Remove use of removeprefix This is not supported on Python 3.8 [1]. I have no idea why this was not failing CI. 
[1] https://docs.python.org/3.9/library/stdtypes.html#str.removeprefix Change-Id: I225e9ced0f75c415b1d2fee05440291e3d8635c0 Signed-off-by: Stephen Finucane (cherry picked from commit 3ccf82ef9e2c87a1d33a0dda8929c05e80844087) --- nova/tests/unit/console/test_websocketproxy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nova/tests/unit/console/test_websocketproxy.py b/nova/tests/unit/console/test_websocketproxy.py index fc25bef2bc3..639623bbb58 100644 --- a/nova/tests/unit/console/test_websocketproxy.py +++ b/nova/tests/unit/console/test_websocketproxy.py @@ -635,7 +635,9 @@ def test_reject_open_redirect(self, url='//example.com/%2F..'): # now the same url but with extra leading '/' characters removed. if expected_cpython in errmsg: location = result[3].decode() - location = location.removeprefix('Location: ').rstrip('\r\n') + if location.startswith('Location: '): + location = location[len('Location: '):] + location = location.rstrip('\r\n') self.assertTrue( location.startswith('/example.com/%2F..'), msg='Redirect location is not the expected sanitized URL', From a54125212ff923d055e8efc7d4f2992b0f934679 Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Wed, 10 Jul 2024 14:23:33 +0100 Subject: [PATCH 58/73] Change force_format strategy to catch mismatches When we moved the qemu-img command in fetch_to_raw() to force the format to what we expect, we lost the ability to identify and react to situations where qemu-img detected a file as a format that is not supported by us (i.e. identfied and safety-checked by format_inspector). In the case of some of the other VMDK variants that we don't support, we need to be sure to catch any case where qemu-img thinks it's something other than raw when we think it is, which will be the case for those formats we don't support. Note this also moves us from explicitly using the format_inspector that we're told by glance is appropriate, to using our own detection. 
We assert that we agree with glance and as above, qemu agrees with us. This helps us avoid cases where the uploader lies about the image format, causing us to not run the appropriate safety check. AMI formats are a liability here since we have a very hard time asserting what they are and what they will be detected as later in the pipeline, so there is still special-casing for those. Closes-Bug: #2071734 Change-Id: I4b792c5bc959a904854c21565682ed3a687baa1a (cherry picked from commit 8b4c522f6699514e7d1f20ac25cf426af6ea588f) (cherry picked from commit 8ef5ec9716c9edbd662ca27b6e39b7848b14f492) (cherry picked from commit 45d948938314997ba400a5fc2bb48bc821c260ab) (cherry picked from commit fbe429051e1fbcd494c71525870651e92e121449) --- nova/tests/unit/virt/libvirt/test_utils.py | 23 +++--- nova/tests/unit/virt/test_images.py | 96 +++++++++++++--------- nova/virt/images.py | 62 +++++++++----- 3 files changed, 108 insertions(+), 73 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_utils.py b/nova/tests/unit/virt/libvirt/test_utils.py index 26de82790d9..5bad7d2a779 100644 --- a/nova/tests/unit/virt/libvirt/test_utils.py +++ b/nova/tests/unit/virt/libvirt/test_utils.py @@ -405,12 +405,12 @@ def test_fetch_initrd_image(self, mock_images): _context, image_id, target, trusted_certs) @mock.patch.object(images, 'IMAGE_API') - @mock.patch.object(format_inspector, 'get_inspector') + @mock.patch.object(format_inspector, 'detect_file_format') @mock.patch.object(compute_utils, 'disk_ops_semaphore') @mock.patch('nova.privsep.utils.supports_direct_io', return_value=True) @mock.patch('nova.privsep.qemu.unprivileged_convert_image') def test_fetch_raw_image(self, mock_convert_image, mock_direct_io, - mock_disk_op_sema, mock_gi, mock_glance): + mock_disk_op_sema, mock_detect, mock_glance): def fake_rename(old, new): self.executes.append(('mv', old, new)) @@ -450,7 +450,7 @@ class FakeImgInfo(object): self.stub_out('oslo_utils.fileutils.delete_if_exists', fake_rm_on_error) - 
mock_inspector = mock_gi.return_value.from_file.return_value + mock_inspector = mock_detect.return_value # Since the remove param of fileutils.remove_path_on_error() # is initialized at load time, we must provide a wrapper @@ -464,6 +464,7 @@ class FakeImgInfo(object): # Make sure qcow2 gets converted to raw mock_inspector.safety_check.return_value = True + mock_inspector.__str__.return_value = 'qcow2' mock_glance.get.return_value = {'disk_format': 'qcow2'} target = 't.qcow2' self.executes = [] @@ -477,12 +478,13 @@ class FakeImgInfo(object): CONF.instances_path, False) mock_convert_image.reset_mock() mock_inspector.safety_check.assert_called_once_with() - mock_gi.assert_called_once_with('qcow2') + mock_detect.assert_called_once_with('t.qcow2.part') # Make sure raw does not get converted - mock_gi.reset_mock() + mock_detect.reset_mock() mock_inspector.safety_check.reset_mock() mock_inspector.safety_check.return_value = True + mock_inspector.__str__.return_value = 'raw' mock_glance.get.return_value = {'disk_format': 'raw'} target = 't.raw' self.executes = [] @@ -491,12 +493,13 @@ class FakeImgInfo(object): self.assertEqual(self.executes, expected_commands) mock_convert_image.assert_not_called() mock_inspector.safety_check.assert_called_once_with() - mock_gi.assert_called_once_with('raw') + mock_detect.assert_called_once_with('t.raw.part') # Make sure safety check failure prevents us from proceeding - mock_gi.reset_mock() + mock_detect.reset_mock() mock_inspector.safety_check.reset_mock() mock_inspector.safety_check.return_value = False + mock_inspector.__str__.return_value = 'qcow2' mock_glance.get.return_value = {'disk_format': 'qcow2'} target = 'backing.qcow2' self.executes = [] @@ -506,10 +509,10 @@ class FakeImgInfo(object): self.assertEqual(self.executes, expected_commands) mock_convert_image.assert_not_called() mock_inspector.safety_check.assert_called_once_with() - mock_gi.assert_called_once_with('qcow2') + 
mock_detect.assert_called_once_with('backing.qcow2.part') # Make sure a format mismatch prevents us from proceeding - mock_gi.reset_mock() + mock_detect.reset_mock() mock_inspector.safety_check.reset_mock() mock_inspector.safety_check.side_effect = ( format_inspector.ImageFormatError) @@ -522,7 +525,7 @@ class FakeImgInfo(object): self.assertEqual(self.executes, expected_commands) mock_convert_image.assert_not_called() mock_inspector.safety_check.assert_called_once_with() - mock_gi.assert_called_once_with('qcow2') + mock_detect.assert_called_once_with('backing.qcow2.part') del self.executes diff --git a/nova/tests/unit/virt/test_images.py b/nova/tests/unit/virt/test_images.py index cc285dc4fec..2e6a518cd70 100644 --- a/nova/tests/unit/virt/test_images.py +++ b/nova/tests/unit/virt/test_images.py @@ -21,7 +21,6 @@ from nova.compute import utils as compute_utils from nova import exception -from nova.image import format_inspector from nova import test from nova.virt import images @@ -101,15 +100,16 @@ def test_qemu_img_info_with_disk_not_found(self, exists, mocked_execute): mocked_execute.assert_called_once() @mock.patch.object(images, 'IMAGE_API') - @mock.patch('nova.image.format_inspector.get_inspector') + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch.object(images, 'convert_image', side_effect=exception.ImageUnacceptable) @mock.patch.object(images, 'qemu_img_info') @mock.patch.object(images, 'fetch') def test_fetch_to_raw_errors(self, convert_image, qemu_img_info, fetch, - get_inspector, glance): - inspector = get_inspector.return_value.from_file.return_value + mock_detect, glance): + inspector = mock_detect.return_value inspector.safety_check.return_value = True + inspector.__str__.return_value = 'qcow2' glance.get.return_value = {'disk_format': 'qcow2'} qemu_img_info.backing_file = None qemu_img_info.file_format = 'qcow2' @@ -120,16 +120,17 @@ def test_fetch_to_raw_errors(self, convert_image, qemu_img_info, fetch, None, 'href123', 
'/no/path') @mock.patch.object(images, 'IMAGE_API') - @mock.patch('nova.image.format_inspector.get_inspector') + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch.object(images, 'convert_image', side_effect=exception.ImageUnacceptable) @mock.patch.object(images, 'qemu_img_info') @mock.patch.object(images, 'fetch') def test_fetch_to_raw_data_file(self, convert_image, qemu_img_info_fn, - fetch, mock_gi, mock_glance): + fetch, mock_detect, mock_glance): mock_glance.get.return_value = {'disk_format': 'qcow2'} - inspector = mock_gi.return_value.from_file.return_value + inspector = mock_detect.return_value inspector.safety_check.return_value = True + inspector.__str__.return_value = 'qcow2' # NOTE(danms): the above test needs the following line as well, as it # is broken without it. qemu_img_info = qemu_img_info_fn.return_value @@ -142,16 +143,17 @@ def test_fetch_to_raw_data_file(self, convert_image, qemu_img_info_fn, images.fetch_to_raw, None, 'href123', '/no/path') - @mock.patch('nova.image.format_inspector.get_inspector') + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch.object(images, 'IMAGE_API') @mock.patch('os.rename') @mock.patch.object(images, 'qemu_img_info') @mock.patch.object(images, 'fetch') def test_fetch_to_raw_from_raw(self, fetch, qemu_img_info_fn, mock_rename, - mock_glance, mock_gi): + mock_glance, mock_detect): # Make sure we support a case where we fetch an already-raw image and # qemu-img returns None for "format_specific". 
mock_glance.get.return_value = {'disk_format': 'raw'} + mock_detect.return_value.__str__.return_value = 'raw' qemu_img_info = qemu_img_info_fn.return_value qemu_img_info.file_format = 'raw' qemu_img_info.backing_file = None @@ -215,14 +217,15 @@ def test_convert_image_vmdk_allowed_list_checking(self): format='json')) @mock.patch.object(images, 'IMAGE_API') - @mock.patch('nova.image.format_inspector.get_inspector') + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch.object(images, 'fetch') @mock.patch('nova.privsep.qemu.unprivileged_qemu_img_info') - def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch, mock_gi, + def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch, mock_detect, mock_glance): mock_glance.get.return_value = {'disk_format': 'vmdk'} - inspector = mock_gi.return_value.from_file.return_value + inspector = mock_detect.return_value inspector.safety_check.return_value = True + inspector.__str__.return_value = 'vmdk' info = {'format': 'vmdk', 'format-specific': { 'type': 'vmdk', @@ -238,13 +241,17 @@ def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch, mock_gi, @mock.patch('os.rename') @mock.patch.object(images, 'IMAGE_API') @mock.patch('nova.image.format_inspector.get_inspector') + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch.object(images, 'fetch') @mock.patch('nova.privsep.qemu.unprivileged_qemu_img_info') - def test_fetch_iso_is_raw(self, mock_info, mock_fetch, mock_gi, - mock_glance, mock_rename): + def test_fetch_iso_is_raw( + self, mock_info, mock_fetch, mock_detect_file_format, mock_gi, + mock_glance, mock_rename): mock_glance.get.return_value = {'disk_format': 'iso'} inspector = mock_gi.return_value.from_file.return_value inspector.safety_check.return_value = True + inspector.__str__.return_value = 'iso' + mock_detect_file_format.return_value = inspector # qemu-img does not have a parser for iso so it is treated as raw info = { "virtual-size": 356352, @@ -258,27 +265,27 
@@ def test_fetch_iso_is_raw(self, mock_info, mock_fetch, mock_gi, images.fetch_to_raw(None, 'foo', 'anypath') # Make sure we called info with -f raw for an iso, since qemu-img does # not support iso - mock_info.assert_called_once_with('anypath.part', format='raw') + mock_info.assert_called_once_with('anypath.part', format=None) # Make sure that since we considered this to be a raw file, we did the # just-rename-don't-convert path mock_rename.assert_called_once_with('anypath.part', 'anypath') @mock.patch.object(images, 'IMAGE_API') - @mock.patch('nova.image.format_inspector.get_inspector') + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch.object(images, 'qemu_img_info') @mock.patch.object(images, 'fetch') - def test_fetch_to_raw_inspector(self, fetch, qemu_img_info, mock_gi, + def test_fetch_to_raw_inspector(self, fetch, qemu_img_info, mock_detect, mock_glance): # Image claims to be qcow2, is qcow2, but fails safety check, so we # abort before qemu-img-info mock_glance.get.return_value = {'disk_format': 'qcow2'} - inspector = mock_gi.return_value.from_file.return_value + inspector = mock_detect.return_value inspector.safety_check.return_value = False + inspector.__str__.return_value = 'qcow2' self.assertRaises(exception.ImageUnacceptable, images.fetch_to_raw, None, 'href123', '/no.path') qemu_img_info.assert_not_called() - mock_gi.assert_called_once_with('qcow2') - mock_gi.return_value.from_file.assert_called_once_with('/no.path.part') + mock_detect.assert_called_once_with('/no.path.part') inspector.safety_check.assert_called_once_with() mock_glance.get.assert_called_once_with(None, 'href123') @@ -292,18 +299,17 @@ def test_fetch_to_raw_inspector(self, fetch, qemu_img_info, mock_gi, # Image claims to be qcow2 in glance, but the image is something else, # so we abort before qemu-img-info qemu_img_info.reset_mock() - mock_gi.reset_mock() + mock_detect.reset_mock() inspector.safety_check.reset_mock() - 
mock_gi.return_value.from_file.side_effect = ( - format_inspector.ImageFormatError) + mock_detect.return_value.__str__.return_value = 'vmdk' self.assertRaises(exception.ImageUnacceptable, images.fetch_to_raw, None, 'href123', '/no.path') - mock_gi.assert_called_once_with('qcow2') - inspector.safety_check.assert_not_called() + mock_detect.assert_called_once_with('/no.path.part') + inspector.safety_check.assert_called_once_with() qemu_img_info.assert_not_called() @mock.patch.object(images, 'IMAGE_API') - @mock.patch('nova.image.format_inspector.get_inspector') + @mock.patch('nova.image.format_inspector.detect_file_format') @mock.patch.object(images, 'qemu_img_info') @mock.patch.object(images, 'fetch') def test_fetch_to_raw_inspector_disabled(self, fetch, qemu_img_info, @@ -316,36 +322,41 @@ def test_fetch_to_raw_inspector_disabled(self, fetch, qemu_img_info, # If deep inspection is disabled, we should never call the inspector mock_gi.assert_not_called() # ... and we let qemu-img detect the format itself. 
- qemu_img_info.assert_called_once_with('/no.path.part', - format=None) + qemu_img_info.assert_called_once_with('/no.path.part') mock_glance.get.assert_not_called() @mock.patch.object(images, 'IMAGE_API') @mock.patch.object(images, 'qemu_img_info') - def test_fetch_inspect_ami(self, imginfo, glance): + @mock.patch('nova.image.format_inspector.detect_file_format') + def test_fetch_inspect_ami(self, detect, imginfo, glance): glance.get.return_value = {'disk_format': 'ami'} + detect.return_value.__str__.return_value = 'raw' self.assertRaises(exception.ImageUnacceptable, images.fetch_to_raw, None, 'href123', '/no.path') # Make sure 'ami was translated into 'raw' before we call qemu-img - imginfo.assert_called_once_with('/no.path.part', format='raw') + imginfo.assert_called_once_with('/no.path.part') @mock.patch.object(images, 'IMAGE_API') @mock.patch.object(images, 'qemu_img_info') - def test_fetch_inspect_aki(self, imginfo, glance): + @mock.patch('nova.image.format_inspector.detect_file_format') + def test_fetch_inspect_aki(self, detect, imginfo, glance): glance.get.return_value = {'disk_format': 'aki'} + detect.return_value.__str__.return_value = 'raw' self.assertRaises(exception.ImageUnacceptable, images.fetch_to_raw, None, 'href123', '/no.path') # Make sure 'aki was translated into 'raw' before we call qemu-img - imginfo.assert_called_once_with('/no.path.part', format='raw') + imginfo.assert_called_once_with('/no.path.part') @mock.patch.object(images, 'IMAGE_API') @mock.patch.object(images, 'qemu_img_info') - def test_fetch_inspect_ari(self, imginfo, glance): + @mock.patch('nova.image.format_inspector.detect_file_format') + def test_fetch_inspect_ari(self, detect, imginfo, glance): glance.get.return_value = {'disk_format': 'ari'} + detect.return_value.__str__.return_value = 'raw' self.assertRaises(exception.ImageUnacceptable, images.fetch_to_raw, None, 'href123', '/no.path') # Make sure 'aki was translated into 'raw' before we call qemu-img - 
imginfo.assert_called_once_with('/no.path.part', format='raw') + imginfo.assert_called_once_with('/no.path.part') @mock.patch.object(images, 'IMAGE_API') @mock.patch.object(images, 'qemu_img_info') @@ -358,13 +369,16 @@ def test_fetch_inspect_unknown_format(self, imginfo, glance): @mock.patch.object(images, 'IMAGE_API') @mock.patch.object(images, 'qemu_img_info') - @mock.patch('nova.image.format_inspector.get_inspector') - def test_fetch_inspect_disagrees_qemu(self, mock_gi, imginfo, glance): + @mock.patch('nova.image.format_inspector.detect_file_format') + def test_fetch_inspect_disagrees_qemu(self, mock_detect, imginfo, glance): glance.get.return_value = {'disk_format': 'qcow2'} + mock_detect.return_value.__str__.return_value = 'qcow2' # Glance and inspector think it is a qcow2 file, but qemu-img does not - # agree. It was forced to interpret as a qcow2, but returned no - # format information as a result. + # agree. imginfo.return_value.data_file = None - self.assertRaises(exception.ImageUnacceptable, - images.fetch_to_raw, None, 'href123', '/no.path') - imginfo.assert_called_once_with('/no.path.part', format='qcow2') + imginfo.return_value.file_format = 'vmdk' + ex = self.assertRaises(exception.ImageUnacceptable, + images.fetch_to_raw, + None, 'href123', '/no.path') + self.assertIn('content does not match disk_format', str(ex)) + imginfo.assert_called_once_with('/no.path.part') diff --git a/nova/virt/images.py b/nova/virt/images.py index 813696ed7d7..193c80fb636 100644 --- a/nova/virt/images.py +++ b/nova/virt/images.py @@ -140,42 +140,50 @@ def check_vmdk_image(image_id, data): def do_image_deep_inspection(img, image_href, path): + ami_formats = ('ami', 'aki', 'ari') disk_format = img['disk_format'] try: # NOTE(danms): Use our own cautious inspector module to make sure # the image file passes safety checks. # See https://bugs.launchpad.net/nova/+bug/2059809 for details. 
- inspector_cls = format_inspector.get_inspector(disk_format) - if not inspector_cls.from_file(path).safety_check(): + + # Make sure we have a format inspector for the claimed format, else + # it is something we do not support and must reject. AMI is excluded. + if (disk_format not in ami_formats and + not format_inspector.get_inspector(disk_format)): + raise exception.ImageUnacceptable( + image_id=image_href, + reason=_('Image not in a supported format')) + + inspector = format_inspector.detect_file_format(path) + if not inspector.safety_check(): raise exception.ImageUnacceptable( image_id=image_href, reason=(_('Image does not pass safety check'))) + + # AMI formats can be other things, so don't obsess over this + # requirement for them. Otherwise, make sure our detection agrees + # with glance. + if disk_format not in ami_formats and str(inspector) != disk_format: + # If we detected the image as something other than glance claimed, + # we abort. + raise exception.ImageUnacceptable( + image_id=image_href, + reason=_('Image content does not match disk_format')) except format_inspector.ImageFormatError: # If the inspector we chose based on the image's metadata does not # think the image is the proper format, we refuse to use it. raise exception.ImageUnacceptable( image_id=image_href, reason=_('Image content does not match disk_format')) - except AttributeError: - # No inspector was found - LOG.warning('Unable to perform deep image inspection on type %r', - img['disk_format']) - if disk_format in ('ami', 'aki', 'ari'): - # A lot of things can be in a UEC, although it is typically a raw - # filesystem. We really have nothing we can do other than treat it - # like a 'raw', which is what qemu-img will detect a filesystem as - # anyway. If someone puts a qcow2 inside, we should fail because - # we won't do our inspection. 
- disk_format = 'raw' - else: - raise exception.ImageUnacceptable( - image_id=image_href, - reason=_('Image not in a supported format')) - - if disk_format == 'iso': - # ISO image passed safety check; qemu will treat this as raw from here + except Exception: + raise exception.ImageUnacceptable( + image_id=image_href, + reason=_('Image not in a supported format')) + if disk_format in ('iso',) + ami_formats: + # ISO or AMI image passed safety check; qemu will treat this as raw + # from here so return the expected formats it will find. disk_format = 'raw' - return disk_format @@ -194,12 +202,22 @@ def fetch_to_raw(context, image_href, path, trusted_certs=None): # Only run qemu-img after we have done deep inspection (if enabled). # If it was not enabled, we will let it detect the format. - data = qemu_img_info(path_tmp, format=force_format) + data = qemu_img_info(path_tmp) fmt = data.file_format if fmt is None: raise exception.ImageUnacceptable( reason=_("'qemu-img info' parsing failed."), image_id=image_href) + elif force_format is not None and fmt != force_format: + # Format inspector and qemu-img must agree on the format, else + # we reject. This will catch VMDK some variants that we don't + # explicitly support because qemu will identify them as such + # and we will not. + LOG.warning('Image %s detected by qemu as %s but we expected %s', + image_href, fmt, force_format) + raise exception.ImageUnacceptable( + image_id=image_href, + reason=_('Image content does not match disk_format')) backing_file = data.backing_file if backing_file is not None: From fe0eb7ad395bad4c3321e4efce9d77506cbf134d Mon Sep 17 00:00:00 2001 From: Elod Illes Date: Fri, 12 Jul 2024 22:53:06 +0200 Subject: [PATCH 59/73] [tools] Ignore bot generated patches This is a fix for the test whether a patch is bot generated or not, as that did not worked as intended. 
The problem is that the script is checking the email address of the parent patch (HEAD~), which probably should be right in case the patch would be a MERGE patch. But this is wrong in case the patch is not a MERGE patch. This fix uses the very same pattern as it is using for the commit message parsing: the $commit_hash variable, which is the parent's commit hash if the patch is a MERGE patch, and an empty string in the other case (causing to call 'git show' on HEAD). Change-Id: I0abc72180edf34a6dd0624a40fb8682397805eca (cherry picked from commit b8f3975d3641fad19971cc159bdb9decb6ea95f8) (cherry picked from commit 92b781f96e076f22ef098ca7894a3eeddb647731) (cherry picked from commit 7a914d6bfc5467e91175c55c8ea63e62e3518d86) (cherry picked from commit 8b79f0f6a4315a89cb90ea86c7e05dfde3b1fc92) --- tools/check-cherry-picks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/check-cherry-picks.sh b/tools/check-cherry-picks.sh index 3042aa16593..439f42df3af 100755 --- a/tools/check-cherry-picks.sh +++ b/tools/check-cherry-picks.sh @@ -14,7 +14,7 @@ if [ $parent_number -eq 2 ]; then commit_hash=$(git show --format='%P' --quiet | awk '{print $NF}') fi -if git show --format='%aE' HEAD~ --quiet | grep -qi 'infra-root@openstack.org'; then +if git show --format='%aE' --quiet $commit_hash | grep -qi 'infra-root@openstack.org'; then echo 'Bot generated change; ignoring' exit 0 fi From f43ceef5769f1bfbeddf062f3fd745fc3c519ace Mon Sep 17 00:00:00 2001 From: Elod Illes Date: Fri, 12 Jul 2024 23:10:26 +0200 Subject: [PATCH 60/73] [tools] Backport validator: handle unmaintained When the script was created there were only stable/* branches, but now there are unmaintained/* branches as well, where the validator fails when looking for hashes only on stable/* branches even if the given hash is already on unmtaintained/* branch. This patch matches now both stable/* and unmaintained/* branches. 
Change-Id: I08fcc63ab0fbe5af1be70d5fde5af98bf006101c (cherry picked from commit e2697de8e41a566eb86aefa364906bda9bc59863) (cherry picked from commit 602e68364c54fb54140006f38d6995b9a5b354a9) (cherry picked from commit 56e73cc7bad51435a79584e9411f07add0d0536a) (cherry picked from commit f53824f95bea8769a2b28c62f23e57cb8dbafae5) --- tools/check-cherry-picks.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/check-cherry-picks.sh b/tools/check-cherry-picks.sh index 439f42df3af..74887a9178b 100755 --- a/tools/check-cherry-picks.sh +++ b/tools/check-cherry-picks.sh @@ -1,7 +1,8 @@ #!/bin/sh # # A tool to check the cherry-pick hashes from the current git commit message -# to verify that they're all on either master or stable/ branches +# to verify that they're all on either master, stable/ or unmaintained/ +# branches # commit_hash="" @@ -23,9 +24,9 @@ hashes=$(git show --format='%b' --quiet $commit_hash | sed -nr 's/^.cherry picke checked=0 branches+="" for hash in $hashes; do - branch=$(git branch -a --contains "$hash" 2>/dev/null| grep -oE '(master|stable/[a-z0-9.]+)') + branch=$(git branch -a --contains "$hash" 2>/dev/null| grep -oE '(master|stable/[a-z0-9.]+|unmaintained/[a-z0-9.]+)') if [ $? -ne 0 ]; then - echo "Cherry pick hash $hash not on any master or stable branches" + echo "Cherry pick hash $hash not on any master, stable or unmaintained branches" exit 1 fi branches+=" $branch" @@ -33,7 +34,7 @@ for hash in $hashes; do done if [ $checked -eq 0 ]; then - if ! grep -q '^defaultbranch=stable/' .gitreview; then + if ! 
grep -qE '^defaultbranch=(stable|unmaintained)/' .gitreview; then echo "Checked $checked cherry-pick hashes: OK" exit 0 else From ffc252eeae01f4829a92a0549b47fb9e4175c2da Mon Sep 17 00:00:00 2001 From: Elod Illes Date: Tue, 18 Jun 2024 15:10:13 +0200 Subject: [PATCH 61/73] [CI] Replace deprecated regex Latest Zuul drops the following warnings: All regular expressions must conform to RE2 syntax, but an expression using the deprecated Perl-style syntax has been detected. Adjust the configuration to conform to RE2 syntax. The RE2 syntax error is: invalid perl operator: (?! This patch replaces the 'irrelevant-files' to 'files' with explicitly listing the pattern which files should be the tests run against. Change-Id: If287e800fb9ff428dbe6f9c4c046627f22afe3df (cherry picked from commit 9b77bae8a32ff41712b96bb6a67c7eacae45a4c9) (cherry picked from commit 8223e6a7c429441c28178316e455767a66d3e8f8) (cherry picked from commit 510a27ba36fc47309308228bc45b5b9ea6ba695b) (cherry picked from commit 197b14d7659252c62b7436bcdd2a9b8c8b470771) --- .zuul.yaml | 54 ++++++++++++++++-------------------------------------- 1 file changed, 16 insertions(+), 38 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 15e44f81fe7..1c39a1c3438 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -224,24 +224,11 @@ parent: devstack-tempest description: | Run tempest compute API tests using LVM image backend. This only runs - against nova/virt/libvirt/* changes. - # Copy irrelevant-files from nova-dsvm-multinode-base and then exclude - # anything that is not in nova/virt/libvirt/* or nova/privsep/*. - irrelevant-files: - - ^(?!.zuul.yaml)(?!nova/virt/libvirt/)(?!nova/privsep/).*$ - - ^api-.*$ - - ^(test-|)requirements.txt$ - - ^.*\.rst$ - - ^.git.*$ - - ^doc/.*$ - - ^nova/hacking/.*$ - - ^nova/locale/.*$ - - ^nova/tests/.*$ - - ^nova/test.py$ - - ^releasenotes/.*$ - - ^setup.cfg$ - - ^tools/.*$ - - ^tox.ini$ + against nova/virt/libvirt/*, nova/privsep/* and .zuul.yaml changes. 
+ files: + - ^nova/virt/libvirt/.*$ + - ^nova/privsep/.*$ + - .zuul.yaml vars: # We use the "all" environment for tempest_test_regex and # tempest_exclude_regex. @@ -280,22 +267,11 @@ # NOTE(chateaulav): due to constraints with no IDE support for aarch64, # tests have been limited to eliminate any items that are incompatible. # This is to be re-evaluated as greater support is added and defined. - irrelevant-files: - - ^(?!.zuul.yaml)(?!nova/virt/libvirt/)(?!nova/objects/)(?!nova/scheduler/).*$ - - ^api-.*$ - - ^(test-|)requirements.txt$ - - ^.*\.rst$ - - ^.git.*$ - - ^doc/.*$ - - ^nova/hacking/.*$ - - ^nova/locale/.*$ - - ^nova/policies/.*$ - - ^nova/tests/.*$ - - ^nova/test.py$ - - ^releasenotes/.*$ - - ^setup.cfg$ - - ^tools/.*$ - - ^tox.ini$ + files: + - ^nova/virt/libvirt/.*$ + - ^nova/objects/.*$ + - ^nova/scheduler/.*$ + - .zuul.yaml vars: tox_envlist: all tempest_test_regex: ^tempest\.(api\.compute\.servers|scenario\.test_network_basic_ops) @@ -670,11 +646,12 @@ - nova-ceph-multistore: irrelevant-files: *nova-base-irrelevant-files - neutron-linuxbridge-tempest: - irrelevant-files: + files: # NOTE(mriedem): This job has its own irrelevant-files section # so that we only run it on changes to networking and libvirt/vif # code; we don't need to run this on all changes. - - ^(?!nova/network/.*)(?!nova/virt/libvirt/vif.py).*$ + - ^nova/network/.*$ + - nova/virt/libvirt/vif.py - nova-live-migration - nova-live-migration-ceph - nova-lvm @@ -734,11 +711,12 @@ - nova-ceph-multistore: irrelevant-files: *nova-base-irrelevant-files - neutron-linuxbridge-tempest: - irrelevant-files: + files: # NOTE(mriedem): This job has its own irrelevant-files section # so that we only run it on changes to networking and libvirt/vif # code; we don't need to run this on all changes. 
- - ^(?!nova/network/.*)(?!nova/virt/libvirt/vif.py).*$ + - ^nova/network/.*$ + - nova/virt/libvirt/vif.py - tempest-integrated-compute: irrelevant-files: *policies-irrelevant-files - nova-grenade-multinode: From e6f4503fe3e20c3fb8afef0aa944d4994665786b Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Wed, 24 Jul 2024 09:01:31 -0700 Subject: [PATCH 62/73] Remove AMI snapshot format special case Note that this includes seemingly-unrelated test changes because we were actually skipping the snapshot_running test for libvirt, which has been a bug for years. In that test case, when we went to look for image_meta.disk_format, that attribute was not set on the o.vo object, which raised a NotImplementedError. That error is also checked by the test to skip the test for drivers that do not support snapshot, which meant that for libvirt, we haven't been running that case beyond the point at which we create snapshot metadata and trip that exception. Thus, once removing that, there are other mocks not in place that are required for the test to actually run. So, this adds mocks for qemu_img_info() calls that actually try to read the file on disk, as well as the privsep chown() that attempts to run after. 
Change-Id: Ie731045629f0899840a4680d21793a16ade9b98e (cherry picked from commit d5a631ba7791b37e49213707e4ea650a56d2ed9e) (cherry picked from commit 8c5929ff5156d5409d41872f1b8ee0abb04f35a8) (cherry picked from commit d2d3b2c9e87fe2247a34a776310221c8b12be515) (cherry picked from commit 77dfa4f6f3c39048b5d3bb9eb2b14dd6998b406b) --- nova/tests/unit/virt/libvirt/test_driver.py | 11 +++++++---- nova/tests/unit/virt/test_virt_drivers.py | 5 +++++ nova/virt/libvirt/driver.py | 6 +----- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index a34a20619f0..47612a6db8c 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -9242,7 +9242,7 @@ def test_unquiesce(self, mock_has_min_version): def test_create_snapshot_metadata(self): base = objects.ImageMeta.from_dict( - {'disk_format': 'raw'}) + {'disk_format': 'qcow2'}) instance_data = {'kernel_id': 'kernel', 'project_id': 'prj_id', 'ramdisk_id': 'ram_id', @@ -9274,10 +9274,12 @@ def test_create_snapshot_metadata(self): {'disk_format': 'ami', 'container_format': 'test_container'}) expected['properties']['os_type'] = instance['os_type'] - expected['disk_format'] = base.disk_format + # The disk_format of the snapshot should be the *actual* format of the + # thing we upload, regardless of what type of image we booted from. 
+ expected['disk_format'] = img_fmt expected['container_format'] = base.container_format ret = drvr._create_snapshot_metadata(base, instance, img_fmt, snp_name) - self.assertEqual(ret, expected) + self.assertEqual(expected, ret) def test_get_volume_driver(self): conn = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) @@ -28778,7 +28780,8 @@ def test_ami(self): utils.get_system_metadata_from_image( {'disk_format': 'ami'}) - self._test_snapshot(disk_format='ami') + # If we're uploading a qcow2, we must set the disk_format as such + self._test_snapshot(disk_format='qcow2') @mock.patch('nova.virt.libvirt.utils.get_disk_type_from_path', new=mock.Mock(return_value=None)) diff --git a/nova/tests/unit/virt/test_virt_drivers.py b/nova/tests/unit/virt/test_virt_drivers.py index ed9f1e3822d..802ea4f027f 100644 --- a/nova/tests/unit/virt/test_virt_drivers.py +++ b/nova/tests/unit/virt/test_virt_drivers.py @@ -838,6 +838,11 @@ def setUp(self): # since we don't care about it. self.stub_out('os_vif.unplug', lambda a, kw: None) self.stub_out('nova.compute.utils.get_machine_ips', lambda: []) + self.stub_out('nova.virt.libvirt.utils.get_disk_size', + lambda *a, **k: 123456) + self.stub_out('nova.virt.libvirt.utils.get_disk_backing_file', + lambda *a, **k: None) + self.stub_out('nova.privsep.path.chown', lambda *a, **k: None) def test_init_host_image_type_rbd_force_raw_images_true(self): CONF.set_override('images_type', 'rbd', group='libvirt') diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index e8bfffc3fa9..b88103862d6 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -2938,11 +2938,7 @@ def _create_snapshot_metadata(self, image_meta, instance, if instance.os_type: metadata['properties']['os_type'] = instance.os_type - # NOTE(vish): glance forces ami disk format to be ami - if image_meta.disk_format == 'ami': - metadata['disk_format'] = 'ami' - else: - metadata['disk_format'] = img_fmt + metadata['disk_format'] = img_fmt if 
image_meta.obj_attr_is_set("container_format"): metadata['container_format'] = image_meta.container_format From 25aafcfce38711eb61a653fb0d09162ea72c5c84 Mon Sep 17 00:00:00 2001 From: jskunda Date: Wed, 21 Jun 2023 12:11:05 +0200 Subject: [PATCH 63/73] Drop Fedora support We are about to drop Fedora support as the latest image in upstream has been transitioned to EOL. Centos 9 Stream has evolved as replacement platform for new features. Patch which removes fedora jobs and nodeset from devstack: https://review.opendev.org/c/openstack/devstack/+/885467 This is needed for https://review.opendev.org/c/openstack/devstack/+/925837 Change-Id: Ib7d3dd93602c94fd801f8fe5daa26353b04f589b (cherry picked from commit 86c542c56a1da23b1ba71cf2f6f2b76332c3b0a6) (cherry picked from commit fde9368dd7c50c2e5601d5683ded60677c657dc8) --- .zuul.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 1c39a1c3438..91e96efd083 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -752,10 +752,6 @@ irrelevant-files: *nova-base-irrelevant-files - os-vif-ovs: irrelevant-files: *nova-base-irrelevant-files - - devstack-platform-fedora-latest: - irrelevant-files: *nova-base-irrelevant-files - - devstack-platform-fedora-latest-virt-preview: - irrelevant-files: *nova-base-irrelevant-files - devstack-plugin-ceph-compute-local-ephemeral: irrelevant-files: *nova-base-irrelevant-files - devstack-tobiko-nova: From a6700bcb7c0e0364ee991f27728df86a06bf0a46 Mon Sep 17 00:00:00 2001 From: Elod Illes Date: Tue, 10 Sep 2024 18:45:30 +0200 Subject: [PATCH 64/73] [stable-only][CI] remove tempest-centos8-stream-fips periodic CentOS Stream 8 became End of Life and centos-8-stream nodeset was removed from Zuul configuration. All jobs that uses this nodeset should be deleted. 
Change-Id: I6b625e57a45906b57c0a339fa5f632c7a34abcf7 --- .zuul.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 91e96efd083..c41933f1c8d 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -761,7 +761,3 @@ - nova-emulation - tempest-integrated-compute-centos-9-stream: irrelevant-files: *nova-base-irrelevant-files - periodic: - jobs: - - tempest-centos8-stream-fips: - branches: master From 6c9ebea5906d961e31c26d2cc035ce48671911a7 Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Tue, 21 Feb 2023 08:43:13 -0800 Subject: [PATCH 65/73] Use mysql memory reduction flags for ceph job This makes the ceph-multistore job use the MYSQL_REDUCE_MEMORY flag in devstack to try to address the frequent OOMs we see in that job. Conflicts: .zuul.yaml Change-Id: Ibc203bd10dcb530027c2c9f58eb840ccc088280d Closes-Bug: #1961068 (cherry picked from commit 84d1f25446731e4e51beb83a017cdf7bfda8c5d5) --- .zuul.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.zuul.yaml b/.zuul.yaml index c41933f1c8d..8f092bdfeb2 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -585,6 +585,7 @@ GLANCE_USE_IMPORT_WORKFLOW: True DEVSTACK_PARALLEL: True CEPH_MIN_CLIENT_VERSION: "mimic" + MYSQL_REDUCE_MEMORY: True # NOTE(danms): This job is pretty heavy as it is, so we disable some # services that are not relevant to the nova-glance-ceph scenario # that this job is intended to validate. From 200d375b320af489938eb776ee9db9c4f68b3fab Mon Sep 17 00:00:00 2001 From: Elod Illes Date: Wed, 13 Nov 2024 17:30:43 +0100 Subject: [PATCH 66/73] [stable-only] Remove unnecessary periodic CI jobs Since this branch is in Unmaintained phase we don't need the listed periodic-weekly jobs to run. They can be run on-demand if needed, but let's spare some CI resource by removing them here. Note that periodic-stable-jobs template is also unnecessary, because the template only runs jobs against stable branches. 
Change-Id: Iabe1a62681e9d274a8ef5200eca4ca23b6dd138b --- .zuul.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 8f092bdfeb2..e46eac85f5a 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -634,7 +634,6 @@ - openstack-cover-jobs - openstack-python3-zed-jobs - openstack-python3-zed-jobs-arm64 - - periodic-stable-jobs - publish-openstack-docs-pti - release-notes-jobs-python3 check: @@ -726,12 +725,6 @@ irrelevant-files: *nova-base-irrelevant-files - openstacksdk-functional-devstack: irrelevant-files: *nova-base-irrelevant-files - periodic-weekly: - jobs: - # Runs emulation feature functionality test less frequently due - # to being the initial release and experimental in nature. - - nova-emulation - - tempest-centos9-stream-fips experimental: jobs: - ironic-tempest-bfv: From 44995b430a604ec24e56f49f9dde27c65ed8cc45 Mon Sep 17 00:00:00 2001 From: "zhong.zhou" Date: Wed, 17 Jul 2024 18:29:46 +0800 Subject: [PATCH 67/73] nova-manage: modify image properties in request_spec At present, we can modify the properties in the instance system_metadata through the sub command image_property of nova-manage, but there may be inconsistencies between their values and those in request_specs. And the migration is based on request_specs, so the same image properties are also written to request_specs. 
Closes-Bug: 2078999 Change-Id: Id36ecd022cb6f7f9a0fb131b0d202b79715870a9 (cherry picked from commit 2a1fad41453ca7ce15b1cd9b517055c4ccdd12cf) (cherry picked from commit ebae97c62f1af6b3b9f6da2abfa920d6528ddb1b) (cherry picked from commit ee30457accabcea10a62652d14d2cf08a6d57ac0) (cherry picked from commit 3fe5c69b73f01a95fa6df017ea0557298fd6126c) --- nova/cmd/manage.py | 10 ++++++++-- nova/tests/unit/cmd/test_manage.py | 14 ++++++++++++-- ...mage-property-bug-2078999-c493fc259d316c24.yaml | 8 ++++++++ 3 files changed, 28 insertions(+), 4 deletions(-) create mode 100644 releasenotes/notes/nova-manage-image-property-bug-2078999-c493fc259d316c24.yaml diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py index 08b8ebb3104..5b655be35f2 100644 --- a/nova/cmd/manage.py +++ b/nova/cmd/manage.py @@ -3258,9 +3258,10 @@ def _validate_image_properties(self, image_properties): # Return the dict so we can update the instance system_metadata return image_properties - def _update_image_properties(self, instance, image_properties): + def _update_image_properties(self, ctxt, instance, image_properties): """Update instance image properties + :param ctxt: nova.context.RequestContext :param instance: The instance to update :param image_properties: List of image properties and values to update """ @@ -3284,8 +3285,13 @@ def _update_image_properties(self, instance, image_properties): for image_property, value in image_properties.items(): instance.system_metadata[f'image_{image_property}'] = value + request_spec = objects.RequestSpec.get_by_instance_uuid( + ctxt, instance.uuid) + request_spec.image = instance.image_meta + # Save and return 0 instance.save() + request_spec.save() return 0 @action_description(_( @@ -3320,7 +3326,7 @@ def set(self, instance_uuid=None, image_properties=None): instance = objects.Instance.get_by_uuid( cctxt, instance_uuid, expected_attrs=['system_metadata']) return self._update_image_properties( - instance, image_properties) + ctxt, instance, image_properties) 
except ValueError as e: print(str(e)) return 6 diff --git a/nova/tests/unit/cmd/test_manage.py b/nova/tests/unit/cmd/test_manage.py index 10c1a77c948..7c44bc6e8fb 100644 --- a/nova/tests/unit/cmd/test_manage.py +++ b/nova/tests/unit/cmd/test_manage.py @@ -4100,6 +4100,8 @@ def test_show_image_properties_unknown_failure( image_property='hw_disk_bus') self.assertEqual(1, ret, 'return code') + @mock.patch('nova.objects.RequestSpec.save') + @mock.patch('nova.objects.RequestSpec.get_by_instance_uuid') @mock.patch('nova.objects.Instance.get_by_uuid') @mock.patch('nova.context.target_cell') @mock.patch('nova.objects.Instance.save') @@ -4108,7 +4110,8 @@ def test_show_image_properties_unknown_failure( @mock.patch('nova.context.get_admin_context', new=mock.Mock(return_value=mock.sentinel.ctxt)) def test_set_image_properties( - self, mock_instance_save, mock_target_cell, mock_get_instance + self, mock_instance_save, mock_target_cell, mock_get_instance, + mock_get_request_spec, mock_request_spec_save ): mock_target_cell.return_value.__enter__.return_value = \ mock.sentinel.cctxt @@ -4117,9 +4120,11 @@ def test_set_image_properties( vm_state=obj_fields.InstanceState.STOPPED, system_metadata={ 'image_hw_disk_bus': 'virtio', - } + }, + image_ref='' ) mock_get_instance.return_value = instance + mock_get_request_spec.return_value = objects.RequestSpec() ret = self.commands.set( instance_uuid=uuidsentinel.instance, image_properties=['hw_cdrom_bus=sata'] @@ -4136,7 +4141,12 @@ def test_set_image_properties( instance.system_metadata.get('image_hw_disk_bus'), 'image_hw_disk_bus' ) + image_props = mock_get_request_spec.return_value.image.properties + self.assertEqual('sata', image_props.get('hw_cdrom_bus')) + self.assertEqual('virtio', image_props.get('hw_disk_bus')) + mock_instance_save.assert_called_once() + mock_request_spec_save.assert_called_once() @mock.patch('nova.objects.Instance.get_by_uuid') @mock.patch('nova.objects.InstanceMapping.get_by_instance_uuid', diff --git 
a/releasenotes/notes/nova-manage-image-property-bug-2078999-c493fc259d316c24.yaml b/releasenotes/notes/nova-manage-image-property-bug-2078999-c493fc259d316c24.yaml new file mode 100644 index 00000000000..03123855e0e --- /dev/null +++ b/releasenotes/notes/nova-manage-image-property-bug-2078999-c493fc259d316c24.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Before the `Bug 2078999 `_ was fixed, + the ``nova-manage image_property set`` command would update the image properties + embedded in the instance but would not update the ones in the request specs. This + led to an unexpected rollback of the image properties that were updated by the + command after an instance migration. From 26e815dbe77006c97f24a9e32a7ed5b1e9612538 Mon Sep 17 00:00:00 2001 From: Zhang Hua Date: Fri, 24 May 2024 15:49:12 +0800 Subject: [PATCH 68/73] Fix deepcopy usage for BlockDeviceMapping in get_root_info The method get_root_info sometimes receives a BlockDeviceMapping object, which lacks a copy method. The previous code assumed root_bdm was always an instance of DriverBlockDevice, a subclass of dict that supports the copy() method. However, during testing, it was discovered that root_bdm could also be a BlockDeviceMapping object, which does not have a copy method. To address this, the change replaces the copy() call with copy.deepcopy() according to the suggestion in the comment [1], which works for both BlockDeviceMapping and DriverBlockDevice instances. The deepcopy method is supported because oslo.versionedobjects implements the __deepcopy__ method. This change ensures the function handles both object types correctly, preventing the AttributeError observed during testing. 
[1] https://review.opendev.org/c/openstack/nova/+/909611/4/nova/virt/libvirt/blockinfo.py Change-Id: I9432718586855ff57e8e6a5cae064e0685dd01e8 (cherry picked from commit 065bf99fc79a3d086e1859f9542afaafa8c3bf00) Signed-off-by: Zhang Hua (cherry picked from commit 9ff4953954dddf9985698869cbe9ff5d00857210) (cherry picked from commit 608a73ee68e6036188b3d2087fddfe8209f50260) (cherry picked from commit 5b57acbbda8224a5afc26b90b09c0a24b4dc3129) --- .../tests/unit/virt/libvirt/test_blockinfo.py | 44 +++++++++++++++++++ nova/virt/libvirt/blockinfo.py | 8 ++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_blockinfo.py b/nova/tests/unit/virt/libvirt/test_blockinfo.py index 5a0dbb40ce3..4e245093af6 100644 --- a/nova/tests/unit/virt/libvirt/test_blockinfo.py +++ b/nova/tests/unit/virt/libvirt/test_blockinfo.py @@ -1289,6 +1289,7 @@ def test_get_root_info_no_bdm_empty_image_meta(self, mock_find_dev): @mock.patch('nova.virt.libvirt.blockinfo.get_info_from_bdm') def test_get_root_info_bdm(self, mock_get_info): + # call get_root_info() with DriverBlockDevice instance = objects.Instance(**self.test_instance) image_meta = objects.ImageMeta.from_dict(self.test_image_meta) root_bdm = {'mount_device': '/dev/vda', @@ -1318,6 +1319,49 @@ def test_get_root_info_bdm(self, mock_get_info): {}, 'virtio') mock_get_info.reset_mock() + @mock.patch('nova.virt.libvirt.blockinfo.get_info_from_bdm') + def test_get_root_info_bdm_with_deepcopy(self, mock_get_info): + # call get_root_info() with BlockDeviceMapping + instance = objects.Instance(**self.test_instance) + image_meta = objects.ImageMeta.from_dict(self.test_image_meta) + root_bdm = objects.BlockDeviceMapping(self.context, + **fake_block_device.FakeDbBlockDeviceDict( + {'id': 3, 'instance_uuid': uuids.instance, + 'device_name': '/dev/sda', + 'source_type': 'blank', + 'destination_type': 'local', + 'device_type': 'cdrom', + 'disk_bus': 'virtio', + 'volume_id': 'fake-volume-id-1', + 
'boot_index': 0})) + # No root_device_name + blockinfo.get_root_info( + instance, 'kvm', image_meta, root_bdm, 'virtio', 'ide') + mock_get_info.assert_called_once_with( + instance, 'kvm', image_meta, root_bdm, {}, 'virtio') + mock_get_info.reset_mock() + # Both device names + blockinfo.get_root_info( + instance, 'kvm', image_meta, root_bdm, 'virtio', 'scsi', + root_device_name='/dev/sda') + mock_get_info.assert_called_once_with( + instance, 'kvm', image_meta, root_bdm, {}, 'virtio') + mock_get_info.reset_mock() + # Missing device names + original_bdm = copy.deepcopy(root_bdm) + root_bdm.device_name = '' + blockinfo.get_root_info( + instance, 'kvm', image_meta, root_bdm, 'virtio', 'scsi', + root_device_name='/dev/sda') + mock_get_info.assert_called_with( + instance, 'kvm', image_meta, mock.ANY, {}, 'virtio') + actual_call = mock_get_info.call_args + _, _, _, actual_bdm, _, _ = actual_call[0] + self.assertEqual( + original_bdm.obj_to_primitive(), + actual_bdm.obj_to_primitive() + ) + def test_get_boot_order_simple(self): disk_info = { 'disk_bus': 'virtio', diff --git a/nova/virt/libvirt/blockinfo.py b/nova/virt/libvirt/blockinfo.py index 4efc6fbaeb1..4d03dc38ea2 100644 --- a/nova/virt/libvirt/blockinfo.py +++ b/nova/virt/libvirt/blockinfo.py @@ -69,6 +69,7 @@ """ +import copy import itertools import operator @@ -444,12 +445,13 @@ def get_root_info(instance, virt_type, image_meta, root_bdm, 'dev': block_device.strip_dev(root_device_name), 'boot_index': '1'} + root_bdm_copy = root_bdm if not get_device_name(root_bdm) and root_device_name: - root_bdm = root_bdm.copy() - root_bdm['device_name'] = root_device_name + root_bdm_copy = copy.deepcopy(root_bdm) + root_bdm_copy['device_name'] = root_device_name return get_info_from_bdm( - instance, virt_type, image_meta, root_bdm, {}, disk_bus, + instance, virt_type, image_meta, root_bdm_copy, {}, disk_bus, ) From 75497b0ba61e85afed3f4a0660c20b2d13cb2ae0 Mon Sep 17 00:00:00 2001 From: Elod Illes Date: Tue, 13 May 2025 15:06:27 
+0200 Subject: [PATCH 69/73] [tool] Fix backport validator for non-SLURP non-SLURP branches are EOL'd in case they reach their end of maintained phase. This could produce a situation when a patch is merged in a non-SLURP branch that was deleted in the meantime and it's further backports fail on gate with backport validator as the hash of the non-SLURP version of the patch is not on any branch. This patch fixes the above issue as follows: in case a hash is not found on any branch, then it checks if it can be found under any *-eol tag and only fails if there is not found either. Change-Id: I56705bce8ee4354cd5cb1577a520c2d1c525f57b (cherry picked from commit e383b465458969ec9271013f2b9e9f24b8225418) (cherry picked from commit 8b0ae7243f8d581e1e73f0b9dcccf710666d931f) (cherry picked from commit 88e49dd65c58536ba8dd39ab7cfde669a433f3f6) (cherry picked from commit db438e55e62599faf2931d0992a5c7689ade3610) (cherry picked from commit 0fdd21fb4ba4d8c0f5ad45cb8bf1d2698c382c6d) --- tools/check-cherry-picks.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/check-cherry-picks.sh b/tools/check-cherry-picks.sh index 74887a9178b..fe75867e59f 100755 --- a/tools/check-cherry-picks.sh +++ b/tools/check-cherry-picks.sh @@ -26,8 +26,11 @@ branches+="" for hash in $hashes; do branch=$(git branch -a --contains "$hash" 2>/dev/null| grep -oE '(master|stable/[a-z0-9.]+|unmaintained/[a-z0-9.]+)') if [ $? -ne 0 ]; then - echo "Cherry pick hash $hash not on any master, stable or unmaintained branches" - exit 1 + branch=$(git tag --contains "$hash" 2>/dev/null| grep -oE '([0-9.]+-eol)') + if [ $? 
-ne 0 ]; then + echo "Cherry pick hash $hash not on any master, stable, unmaintained or EOL'd branches" + exit 1 + fi fi branches+=" $branch" checked=$(($checked + 1)) From 422d33ecc1682f8880b08ec1b302a1be88bfb39e Mon Sep 17 00:00:00 2001 From: Takashi Kajinami Date: Mon, 5 Jan 2026 21:19:54 +0900 Subject: [PATCH 70/73] Replace removed os-vif-ovs job It was replaced by os-vif-ovn job. Changes: .zuul.yaml NOTE(elod.illes): the change is because patch in version unmaintained/2024.1 included the remove of grenade job from 2023.2, because it went to End of Life, hence the job couldn't run anymore. So this is basically a partial backport from 2024.1, but technically the same patch as on stable/* branches. Depends-on: https://review.opendev.org/c/openstack/os-vif/+/798038 Change-Id: I4fc595eb51c05c4875bc94e0e812f117a35df7cf Signed-off-by: Takashi Kajinami (cherry picked from commit ad911932ff90a25af1abc8fa4a95b07d03f55705) (cherry picked from commit 1783a30680410ebac553e422b0e287f91d101c9e) (cherry picked from commit ccf3e18af9ed0162553b5b6524e90445b162a6db) (cherry picked from commit 95c43bcf7c1ad852bd77e57232acec9f06b9f34f) (cherry picked from commit d7ca3d90ab97547a1ea21fe953b61d7b421592b7) (cherry picked from commit eaa65f0b85123a4ee3432466a15534b4dd19d3eb) --- .zuul.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.zuul.yaml b/.zuul.yaml index e46eac85f5a..141a37f4cf3 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -744,7 +744,7 @@ irrelevant-files: *nova-base-irrelevant-files - neutron-ovs-tempest-iptables_hybrid: irrelevant-files: *nova-base-irrelevant-files - - os-vif-ovs: + - os-vif-ovn: irrelevant-files: *nova-base-irrelevant-files - devstack-plugin-ceph-compute-local-ephemeral: irrelevant-files: *nova-base-irrelevant-files From ece9853c56b506431a9ae1b25f1268f2fc76719b Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Tue, 17 Feb 2026 06:39:04 -0800 Subject: [PATCH 71/73] Make disk.extend() pass format to qemu-img This fixes an instance of us 
passing a disk image to qemu-img for resize where we don't constrain the format. As has previously been identified, it is never safe to do that when the image itself is not trusted. In this case, an instance with a previously-raw disk image being used by imagebackend.Flat is susceptible to the user writing a qcow2 (or other) header to their disk causing the unconstrained qemu-img resize operation to interpret it as a qcow2 file. Since Flat maintains the intended disk format in the disk.info file, and since we would have safety-checked images we got from glance, we should be able to trust the image.format specifier, which comes from driver_format in imagebackend, which is read from disk.info. Since only raw or qcow2 files should be resized anyway, we can further constrain it to those. Notes: 1. qemu-img refuses to resize some types of VMDK files, but it may be able to resize others (there are many subformats). Technically, Flat will allow running an instance directly from a VMDK file, and so this change _could_ be limiting existing "unintentionally works" behavior. 2. This assumes that disk.info is correct, present, etc. The code to handle disk.info will regenerate the file if it's missing or unreadable by probing the image without a safety check, which would be unsafe. However, that is a much more sophisticated attack, requiring either access to the system to delete the file or an errant operator action in the first place. 
Change-Id: I07cbe90b7a7a0a416ef13fbc3a1b7e2272c90951 Closes-Bug: #2137507 (cherry picked from commit 3eba22ff09c81a61750fbb4882e5f1f01a20fdf5) (cherry picked from commit f448173e3c531f3b298ed2f6f02ff9b47981fbc1) (cherry picked from commit 992646e49b4b4d96f3258dc154d6f00a43d18d01) Signed-off-by: Dan Smith (cherry picked from commit 92d5d741e4018435d84d2f0886953031c33c3e4d) (cherry picked from commit 06d1077186e215f15f38ef62d1704ee3379b7fe7) (cherry picked from commit cfef09ac2286fad57ad4e40088091376598c1ea8) --- nova/tests/unit/virt/disk/test_api.py | 35 ++++++++++++++++++++++++--- nova/virt/disk/api.py | 16 +++++++++++- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/nova/tests/unit/virt/disk/test_api.py b/nova/tests/unit/virt/disk/test_api.py index 62005de5254..95432e5ad79 100644 --- a/nova/tests/unit/virt/disk/test_api.py +++ b/nova/tests/unit/virt/disk/test_api.py @@ -19,6 +19,7 @@ from oslo_concurrency import processutils from oslo_utils import units +from nova import exception from nova import test from nova.virt.disk import api from nova.virt.disk.mount import api as mount @@ -127,7 +128,7 @@ def test_extend_qcow_success(self, mock_exec, mock_inst, mock_resize, mock_can_resize.assert_called_once_with(imgfile, imgsize) mock_exec.assert_called_once_with('qemu-img', 'resize', - imgfile, imgsize) + '-f', 'qcow2', imgfile, imgsize) mock_extendable.assert_called_once_with(image) mock_inst.assert_called_once_with(image, None, None) mock_resize.assert_called_once_with(mounter.device, @@ -153,8 +154,8 @@ def test_extend_qcow_no_resize(self, mock_execute, mock_extendable, api.extend(image, imgsize) mock_can_resize_image.assert_called_once_with(imgfile, imgsize) - mock_execute.assert_called_once_with('qemu-img', 'resize', imgfile, - imgsize) + mock_execute.assert_called_once_with('qemu-img', 'resize', '-f', + 'qcow2', imgfile, imgsize) self.assertFalse(mock_extendable.called) @mock.patch.object(api, 'can_resize_image', autospec=True, @@ -185,8 +186,34 @@ 
def test_extend_raw_success(self, mock_exec, mock_resize, api.extend(image, imgsize) mock_exec.assert_has_calls( - [mock.call('qemu-img', 'resize', imgfile, imgsize), + [mock.call('qemu-img', 'resize', '-f', 'raw', imgfile, imgsize), mock.call('e2label', image.path)]) mock_resize.assert_called_once_with(imgfile, run_as_root=False, check_exit_code=[0]) mock_can_resize.assert_called_once_with(imgfile, imgsize) + + @mock.patch.object(api, 'can_resize_image', autospec=True, + return_value=True) + @mock.patch.object(api, 'resize2fs', autospec=True) + @mock.patch('oslo_concurrency.processutils.execute', autospec=True) + def test_extend_vmdk_failure(self, mock_exec, mock_resize, + mock_can_resize): + + imgfile = tempfile.NamedTemporaryFile() + self.addCleanup(imgfile.close) + imgsize = 10 + # NOTE(danms): There is no image.model.FORMAT_VMDK, but since the + # code initializes this directly from Image.disk_format without using + # the constant (tsk), this can actually happen at runtime. + self.assertRaises(exception.InvalidImageFormat, + imgmodel.LocalFileImage, imgfile, 'vmdk') + + # Patch ALL_FORMATS to include vmdk as if it got added at some point + with mock.patch('nova.virt.image.model.ALL_FORMATS', + new=['vmdk']): + image = imgmodel.LocalFileImage(imgfile, 'vmdk') + + # Make sure that we still don't call qemu-img resize on the image + self.assertRaises(exception.InvalidDiskFormat, + api.extend, image, imgsize) + mock_exec.assert_not_called() diff --git a/nova/virt/disk/api.py b/nova/virt/disk/api.py index 9902c0608ba..580e4daf1f1 100644 --- a/nova/virt/disk/api.py +++ b/nova/virt/disk/api.py @@ -125,7 +125,21 @@ def extend(image, size): nova.privsep.libvirt.ploop_resize(image.path, size) return - processutils.execute('qemu-img', 'resize', image.path, size) + # NOTE(danms): We should not call qemu-img without a format, and + # only qcow2 and raw are supported. So check which one we're being + # told this is supposed to be and pass that to qemu-img. 
Also note + # that we need to pass the qemu format string to this command, which + # may or may not be the same as the FORMAT_* constant, so be + # explicit here. + if image.format == imgmodel.FORMAT_RAW: + format = 'raw' + elif image.format == imgmodel.FORMAT_QCOW2: + format = 'qcow2' + else: + LOG.warning('Attempting to resize image %s with format %s, ' + 'which is not supported', image.path, image.format) + raise exception.InvalidDiskFormat(disk_format=image.format) + processutils.execute('qemu-img', 'resize', '-f', format, image.path, size) if (image.format != imgmodel.FORMAT_RAW and not CONF.resize_fs_using_block_device): From 876607942f7fa69a40460089b4fc503fcd92ea7f Mon Sep 17 00:00:00 2001 From: Elod Illes Date: Wed, 18 Feb 2026 13:35:27 +0100 Subject: [PATCH 72/73] [CI][stable-only] Workaround for missing pkg_resources With recent virtualenv release (that bundles setuptools in it) tox jobs started to fail with 'missing pkg_resources module' errors. Since this is an old branch, where we have only python version <= 3.11 in gate jobs, it is enough to cap virtualenv to fix the gate. Conflicts: tox.ini Change-Id: If10ce13899edaf6dc1d5798aee17842e54939bc7 Signed-off-by: Elod Illes (cherry picked from commit dd8c0613935834e312834cab7bf451736ed19fb6) (cherry picked from commit 7530949b1bbdabbd459ae1d970bdcf95e3b19ba5) --- tox.ini | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tox.ini b/tox.ini index 0f798c2fde4..1df42f3631f 100644 --- a/tox.ini +++ b/tox.ini @@ -5,6 +5,12 @@ envlist = py39,functional,pep8 # env and ignore basepython inherited from [testenv] if we set # ignore_basepython_conflict. ignore_basepython_conflict = True +# NOTE(elod.illes): latest virtualenv bundles setuptools 82.0.0, which +# dropped pkg_resources module source from the package, which is used +# in nova and other packages, thus virtualenv needs to be pinned to fix +# the gate. 
+requires = + virtualenv<20.37.0 [testenv] basepython = python3 From a8c53ef4c33e355b7e15251aeb862a42a434b8bf Mon Sep 17 00:00:00 2001 From: melanie witt Date: Wed, 16 Apr 2025 15:20:23 -0700 Subject: [PATCH 73/73] libvirt: Use common naming convention for ephemeral disk labels The _create_ephemeral() method is responsible for creating ephemeral disks with image type "raw" and formatting them with mkfs. In the case of [libvirt]images_type "qcow2", _create_ephemeral() will create backing files. Currently we are not using a consistent naming convention for choosing the filesystem label for ephemeral disks. When we create a server for example, we go through the disks and label them "ephemeral0", "ephemeral1", "ephemeral2", etc. When we hard reboot a server, there is a check to create missing backing files and if so, a new backing file will be created but instead of being labeled "ephemeralN" the code attempts to label them with the name of the backing file itself for example "ephemeral_1_40d1d2c". This will fail if the filesystem used for ephemeral disks has limitations on the length of filesystem label names (VFAT, XFS, ...). For example: mkfs.vfat: Label can be no longer than 11 characters This adds a helper method for obtaining ephemeral disks filesystem label names and uses it the same way in the few places fs_label is specified. 
Closes-Bug: #2061701 Change-Id: Id033a5760272e4fb06dee2342414b26aa16ffe24 (cherry picked from commit 82856f95c69bb07bd2a61decae9abe827a2a1567) (cherry picked from commit 09fc2fae424493ff9580c6d38e63f207b916529c) (cherry picked from commit 2fd65bd14ac33b58e247bf9d3c8066fa5dac2215) Signed-off-by: Pierre Riteau (cherry picked from commit d6cdd73c980b31080710f9a8cf10d7c779780c43) (cherry picked from commit 911cc31b8cbbee9088a5c7b30ed34f3f2a327b3d) --- nova/tests/unit/virt/libvirt/test_driver.py | 5 ++++- nova/virt/libvirt/driver.py | 13 ++++++++++--- ...01-ephemeral-disk-fs-label-504484c4522e6d6a.yaml | 6 ++++++ 3 files changed, 20 insertions(+), 4 deletions(-) create mode 100644 releasenotes/notes/bug-2061701-ephemeral-disk-fs-label-504484c4522e6d6a.yaml diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 47612a6db8c..5e448c0b461 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -14312,8 +14312,11 @@ def test_create_images_and_backing_ephemeral_gets_created( 'ephemeral_foo') ] + # This also asserts that the filesystem label name is generated + # correctly as 'ephemeral0' to help prevent regression of the + # related bug fix from https://launchpad.net/bugs/2061701 create_ephemeral_mock.assert_called_once_with( - ephemeral_size=1, fs_label='ephemeral_foo', + ephemeral_size=1, fs_label='ephemeral0', os_type='linux', target=ephemeral_backing) fetch_image_mock.assert_called_once_with( diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index b88103862d6..3e25198f3ba 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -4635,6 +4635,13 @@ def _inject_data(self, disk, instance, injection_info): {'img_id': img_id, 'e': e}, instance=instance) + @staticmethod + def _get_fs_label_ephemeral(index: int) -> str: + # Use a consistent naming convention for FS labels. 
We need to be + # mindful of various filesystems label name length limitations. + # See for example: https://bugs.launchpad.net/nova/+bug/2061701 + return f'ephemeral{index}' + # NOTE(sileht): many callers of this method assume that this # method doesn't fail if an image already exists but instead # think that it will be reused (ie: (live)-migration/resize) @@ -4750,7 +4757,7 @@ def raw(fname, disk_info_mapping=None): created_disks = created_disks or not disk_image.exists() fn = functools.partial(self._create_ephemeral, - fs_label='ephemeral0', + fs_label=self._get_fs_label_ephemeral(0), os_type=instance.os_type, is_block_dev=disk_image.is_block_dev, vm_mode=vm_mode) @@ -4776,7 +4783,7 @@ def raw(fname, disk_info_mapping=None): raise exception.InvalidBDMFormat(details=msg) fn = functools.partial(self._create_ephemeral, - fs_label='ephemeral%d' % idx, + fs_label=self._get_fs_label_ephemeral(idx), os_type=instance.os_type, is_block_dev=disk_image.is_block_dev, vm_mode=vm_mode) @@ -10980,7 +10987,7 @@ def _create_images_and_backing(self, context, instance, instance_dir, # cached. disk.cache( fetch_func=self._create_ephemeral, - fs_label=cache_name, + fs_label=self._get_fs_label_ephemeral(0), os_type=instance.os_type, filename=cache_name, size=info['virt_disk_size'], diff --git a/releasenotes/notes/bug-2061701-ephemeral-disk-fs-label-504484c4522e6d6a.yaml b/releasenotes/notes/bug-2061701-ephemeral-disk-fs-label-504484c4522e6d6a.yaml new file mode 100644 index 00000000000..5f4c22ca248 --- /dev/null +++ b/releasenotes/notes/bug-2061701-ephemeral-disk-fs-label-504484c4522e6d6a.yaml @@ -0,0 +1,6 @@ +fixes: + - | + Fixed an issue where certain server actions could fail for servers with + ephemeral disks due to filesystem label name length limitations + (VFAT, XFS, ...). Filesystem label name generation has been fixed for these + cases. See https://launchpad.net/bugs/2061701 for more details.