Message ID: 20250102185831.4324-1-farosas@suse.de (mailing list archive)
State: New
Series: migration: Add more error handling to analyze-migration.py
On Thu, Jan 02, 2025 at 03:58:31PM -0300, Fabiano Rosas wrote:
> The analyze-migration script was seen failing on s390x in mysterious
> ways. It seems we're reaching the subsection constructor without any

It might be a good idea to add some debug lines indeed. Though are you sure
it's from parsing a subsection?

https://lore.kernel.org/all/20241219123213.GA692742@fedora/

====8<====
stderr:
Traceback (most recent call last):
  File "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", line 688, in <module>
    dump.read(dump_memory = args.memory)
  File "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", line 625, in read
    section.read()
  File "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", line 461, in read
    field['data'] = reader(field, self.file)
  File "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", line 434, in __init__
    for field in self.desc['struct']['fields']:
KeyError: 'fields'
====8<====

It reads to me that it's not in the "if 'subsections' in self.desc['struct']"
loop yet; instead it looks like a real STRUCT field in one of the device
sections? If that's true, then...

> fields, which would indicate an empty .subsection entry in the vmstate
> definition. We don't have any of those, at least not without the
> unmigratable flag set, so this should never happen.
>
> Add some debug statements so that we can see what's going on the next
> time the issue happens.
>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> ---
>  scripts/analyze-migration.py | 33 +++++++++++++++++++++++++++------
>  1 file changed, 27 insertions(+), 6 deletions(-)
>
> diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py
> index 8a254a5b6a..1dd98f1d5a 100755
> --- a/scripts/analyze-migration.py
> +++ b/scripts/analyze-migration.py
> @@ -429,6 +429,10 @@ def __init__(self, desc, file):
>          super(VMSDFieldStruct, self).__init__(desc, file)
>          self.data = collections.OrderedDict()
>
> +        if 'fields' not in self.desc['struct']:
> +            raise Exception("No fields in subsection key=%s name=%s" %
> +                            (self.section_key, self.vmsd_name))

... this self.section_key / self.vmsd_name may not exist..

In all cases, IMHO it would be better if the debug lines worked with both
pure structs and sections.

> +
>          # When we see compressed array elements, unfold them here
>          new_fields = []
>          for field in self.desc['struct']['fields']:
> @@ -477,6 +481,11 @@ def read(self):
>                          raise Exception("Subsection %s not found at offset %x" % ( subsection['vmsd_name'], self.file.tell()))
>                  name = self.file.readstr()
>                  version_id = self.file.read32()
> +
> +                if not subsection:
> +                    raise Exception("Empty description for subsection %s" %
> +                                    name)

This is checking subsection==None?? I doubt whether this will hit..
> +
>                  self.data[name] = VMSDSection(self.file, version_id, subsection, (name, 0))
>                  self.data[name].read()
>
> @@ -575,9 +584,8 @@ def __init__(self, filename):
>          self.filename = filename
>          self.vmsd_desc = None
>
> -    def read(self, desc_only = False, dump_memory = False, write_memory = False):
> -        # Read in the whole file
> -        file = MigrationFile(self.filename)
> +    def _read(self, file, vmsd_json, desc_only = False, dump_memory = False,
> +              write_memory = False):
>
>          # File magic
>          data = file.read32()
> @@ -589,7 +597,7 @@ def read(self, desc_only = False, dump_memory = False, write_memory = False):
>          if data != self.QEMU_VM_FILE_VERSION:
>              raise Exception("Invalid version number %d" % data)
>
> -        self.load_vmsd_json(file)
> +        self.load_vmsd_json(file, vmsd_json)
>
>          # Read sections
>          self.sections = collections.OrderedDict()
> @@ -632,12 +640,25 @@ def read(self, desc_only = False, dump_memory = False, write_memory = False):
>                  raise Exception("Mismatched section footer: %x vs %x" % (read_section_id, section_id))
>              else:
>                  raise Exception("Unknown section type: %d" % section_type)
> -        file.close()
>
> -    def load_vmsd_json(self, file):
> +    def read(self, desc_only = False, dump_memory = False,
> +             write_memory = False):
> +        file = MigrationFile(self.filename)
>          vmsd_json = file.read_migration_debug_json()
> +
> +        try:
> +            self._read(file, vmsd_json, desc_only, dump_memory, write_memory)
> +        except:
> +            raise Exception("Full JSON dump:\n%s", vmsd_json)

Better dump the Exception line itself too? IIUC that needs things like:

  except Exception as e:
      raise Exception(XXX, e, vmsd_json)

More below..

> +        finally:
> +            file.close()
> +
> +    def load_vmsd_json(self, file, vmsd_json):
>          self.vmsd_desc = json.loads(vmsd_json, object_pairs_hook=collections.OrderedDict)
>          for device in self.vmsd_desc['devices']:
> +            if 'fields' not in device:
> +                raise Exception("vmstate for device %s has no fields" %
> +                                device['name'])

This looks like a valid check, though I still doubt whether this would hit
at all for this specific error (which looks like a top-level STRUCT of a
section that is missing "fields").

>              key = (device['name'], device['instance_id'])
>              value = ( VMSDSection, device )
>              self.section_classes[key] = value
> --
> 2.35.3
>

For the capture part, would it be easier if we trap at the very top level
once and for all, then try to dump the vmsd_desc as long as it exists? Like
this:

====8<====
@@ -685,9 +686,15 @@ def default(self, o):
     f.close()
 elif args.dump == "state":
     dump = MigrationDump(args.file)
-    dump.read(dump_memory = args.memory)
-    dict = dump.getDict()
-    print(jsonenc.encode(dict))
+    try:
+        dump.read(dump_memory = args.memory)
+        dict = dump.getDict()
+        print(jsonenc.encode(dict))
+    except Exception as e:
+        # If loading vmstate stream failed, try the best to dump vmsd desc
+        raise Exception(
+            f"Caught exception when reading dump: {e}\n"
+            f"Trying to dump vmsd_desc: {jsonenc.encode(dump.vmsd_desc)}")
 elif args.dump == "desc":
     dump = MigrationDump(args.file)
     dump.read(desc_only = True)
====8<====

So no matter what caused the error, fall back to trying to dump vmsd_desc
as long as it's available.

Thanks,
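As an aside on the "dump the Exception line itself" point above, the snippet
below is only an illustrative sketch (the parse() helper and the JSON literal
are made up, not from the script) of how Python 3 chains exceptions when
re-raising, which is what the follow-up reply relies on.

# Sketch only, not part of the patch under review.
def parse():
    raise KeyError('fields')            # stand-in for the real parse failure

vmsd_json = '{"devices": []}'

try:
    parse()
except Exception as e:
    # Raising inside an "except" block chains implicitly: the traceback
    # shows the original KeyError, then "During handling of the above
    # exception, another exception occurred:", then this one. Adding
    # "from e" makes the link explicit ("The above exception was the
    # direct cause of the following exception:").
    raise Exception("Full JSON dump:\n%s" % vmsd_json) from e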
Peter Xu <peterx@redhat.com> writes:

> On Thu, Jan 02, 2025 at 03:58:31PM -0300, Fabiano Rosas wrote:
>> The analyze-migration script was seen failing on s390x in mysterious
>> ways. It seems we're reaching the subsection constructor without any
>
> It might be a good idea to add some debug lines indeed. Though are you sure
> it's from parsing a subsection?

Ah, indeed it's not always a subsection. So I need to go back to the code
and do more auditing; we might have a wrong struct macro somewhere.

>
> https://lore.kernel.org/all/20241219123213.GA692742@fedora/
>
> ====8<====
> stderr:
> Traceback (most recent call last):
>   File "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", line 688, in <module>
>     dump.read(dump_memory = args.memory)
>   File "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", line 625, in read
>     section.read()
>   File "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", line 461, in read
>     field['data'] = reader(field, self.file)
>   File "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", line 434, in __init__
>     for field in self.desc['struct']['fields']:
> KeyError: 'fields'
> ====8<====
>
> It reads to me that it's not in the "if 'subsections' in self.desc['struct']"
> loop yet; instead it looks like a real STRUCT field in one of the device
> sections? If that's true, then...
>
>> fields, which would indicate an empty .subsection entry in the vmstate
>> definition. We don't have any of those, at least not without the
>> unmigratable flag set, so this should never happen.
>>
>> Add some debug statements so that we can see what's going on the next
>> time the issue happens.
>>
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> ---
>>  scripts/analyze-migration.py | 33 +++++++++++++++++++++++++++------
>>  1 file changed, 27 insertions(+), 6 deletions(-)
>>
>> diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py
>> index 8a254a5b6a..1dd98f1d5a 100755
>> --- a/scripts/analyze-migration.py
>> +++ b/scripts/analyze-migration.py
>> @@ -429,6 +429,10 @@ def __init__(self, desc, file):
>>          super(VMSDFieldStruct, self).__init__(desc, file)
>>          self.data = collections.OrderedDict()
>>
>> +        if 'fields' not in self.desc['struct']:
>> +            raise Exception("No fields in subsection key=%s name=%s" %
>> +                            (self.section_key, self.vmsd_name))
>
> ... this self.section_key / self.vmsd_name may not exist..

Right, so there's no information we can get at this point, I think. But
raising will still trigger the dump later on.

>
> In all cases, IMHO it would be better if the debug lines worked with both
> pure structs and sections.

Yeah, I need to change that to something generic.

>
>> +
>>          # When we see compressed array elements, unfold them here
>>          new_fields = []
>>          for field in self.desc['struct']['fields']:
>> @@ -477,6 +481,11 @@ def read(self):
>>                          raise Exception("Subsection %s not found at offset %x" % ( subsection['vmsd_name'], self.file.tell()))
>>                  name = self.file.readstr()
>>                  version_id = self.file.read32()
>> +
>> +                if not subsection:
>> +                    raise Exception("Empty description for subsection %s" %
>> +                                    name)
>
> This is checking subsection==None?? I doubt whether this will hit..

Well, there could be a None in the list of subsections somehow, no?
>
>> +
>>                  self.data[name] = VMSDSection(self.file, version_id, subsection, (name, 0))
>>                  self.data[name].read()
>>
>> @@ -575,9 +584,8 @@ def __init__(self, filename):
>>          self.filename = filename
>>          self.vmsd_desc = None
>>
>> -    def read(self, desc_only = False, dump_memory = False, write_memory = False):
>> -        # Read in the whole file
>> -        file = MigrationFile(self.filename)
>> +    def _read(self, file, vmsd_json, desc_only = False, dump_memory = False,
>> +              write_memory = False):
>>
>>          # File magic
>>          data = file.read32()
>> @@ -589,7 +597,7 @@ def read(self, desc_only = False, dump_memory = False, write_memory = False):
>>          if data != self.QEMU_VM_FILE_VERSION:
>>              raise Exception("Invalid version number %d" % data)
>>
>> -        self.load_vmsd_json(file)
>> +        self.load_vmsd_json(file, vmsd_json)
>>
>>          # Read sections
>>          self.sections = collections.OrderedDict()
>> @@ -632,12 +640,25 @@ def read(self, desc_only = False, dump_memory = False, write_memory = False):
>>                  raise Exception("Mismatched section footer: %x vs %x" % (read_section_id, section_id))
>>              else:
>>                  raise Exception("Unknown section type: %d" % section_type)
>> -        file.close()
>>
>> -    def load_vmsd_json(self, file):
>> +    def read(self, desc_only = False, dump_memory = False,
>> +             write_memory = False):
>> +        file = MigrationFile(self.filename)
>>          vmsd_json = file.read_migration_debug_json()
>> +
>> +        try:
>> +            self._read(file, vmsd_json, desc_only, dump_memory, write_memory)
>> +        except:
>> +            raise Exception("Full JSON dump:\n%s", vmsd_json)
>
> Better dump the Exception line itself too? IIUC that needs things like:
>
>   except Exception as e:
>       raise Exception(XXX, e, vmsd_json)

It already shows both exceptions in the form:

Traceback (most recent call last):
<backtrace>
Exception: <msg>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
<backtrace>
Exception: ('Full JSON dump:\n%s', '{"configuration": {"vmsd_name" ...

>
> More below..
>
>> +        finally:
>> +            file.close()
>> +
>> +    def load_vmsd_json(self, file, vmsd_json):
>>          self.vmsd_desc = json.loads(vmsd_json, object_pairs_hook=collections.OrderedDict)
>>          for device in self.vmsd_desc['devices']:
>> +            if 'fields' not in device:
>> +                raise Exception("vmstate for device %s has no fields" %
>> +                                device['name'])
>
> This looks like a valid check, though I still doubt whether this would hit
> at all for this specific error (which looks like a top-level STRUCT of a
> section that is missing "fields").

Ok, but it's a (VMSDSection, device) tuple below. That "device" might end up
in the constructor of the FieldStruct:

MigrationDump.load_vmsd_json:
    ...
    value = ( VMSDSection, device )
    self.section_classes[key] = value
    ...

MigrationDump.read:
    ...
    classdesc = self.section_classes[section_key]
    section = classdesc[0](file, version_id, classdesc[1], section_key)

class VMSDSection(VMSDFieldStruct):
    def __init__(self, file, version_id, device, section_key):
        ...
        # A section really is nothing but a FieldStruct :)
        super(VMSDSection, self).__init__({ 'struct' : desc }, file)

>>              key = (device['name'], device['instance_id'])
>>              value = ( VMSDSection, device )
>>              self.section_classes[key] = value
>> --
>> 2.35.3
>>
>
> For the capture part, would it be easier if we trap at the very top level
> once and for all, then try to dump the vmsd_desc as long as it exists?
> Like this:
>
> ====8<====
> @@ -685,9 +686,15 @@ def default(self, o):
>      f.close()
>  elif args.dump == "state":
>      dump = MigrationDump(args.file)
> -    dump.read(dump_memory = args.memory)
> -    dict = dump.getDict()
> -    print(jsonenc.encode(dict))
> +    try:
> +        dump.read(dump_memory = args.memory)

This is not the only dump.read() call in the file. Ideally we'd wrap them
all. But then there's a slight clash with the parameter validation part
(args.dump == ...) because it raises if no valid option is used. I'll try
to improve this a bit. I was just trying to get this thing to spit out
something useful so we can unblock the migration PR.

> +        dict = dump.getDict()
> +        print(jsonenc.encode(dict))
> +    except Exception as e:
> +        # If loading vmstate stream failed, try the best to dump vmsd desc
> +        raise Exception(
> +            f"Caught exception when reading dump: {e}\n"
> +            f"Trying to dump vmsd_desc: {jsonenc.encode(dump.vmsd_desc)}")

I wanted to avoid doing any processing after the error because there's a
chance of a Python issue during decoding. But I can maybe store the string
instead at load_vmsd_json.

>  elif args.dump == "desc":
>      dump = MigrationDump(args.file)
>      dump.read(desc_only = True)
> ====8<====
>
> So no matter what caused the error, fall back to trying to dump vmsd_desc
> as long as it's available.
>
> Thanks,
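A minimal sketch of the "store the string instead at load_vmsd_json" idea
mentioned above. The class and attribute names here (MigrationDumpSketch,
vmsd_json_str) are illustrative placeholders, not from the script; the point
is only that the raw JSON text is saved before parsing, so an error path can
print it without re-encoding anything after the failure.

import collections
import json

class MigrationDumpSketch:
    def __init__(self, vmsd_json):
        # Keep the raw JSON text around so an error path can print it
        # verbatim, with no re-encoding after something already failed.
        self.vmsd_json_str = vmsd_json
        self.vmsd_desc = json.loads(
            vmsd_json, object_pairs_hook=collections.OrderedDict)

    def read(self):
        raise KeyError('fields')        # stand-in for the real failure

dump = MigrationDumpSketch('{"devices": []}')
try:
    dump.read()
except Exception as e:
    raise Exception("Error while reading the dump: %r\n"
                    "vmsd description:\n%s" % (e, dump.vmsd_json_str)) from e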
diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py
index 8a254a5b6a..1dd98f1d5a 100755
--- a/scripts/analyze-migration.py
+++ b/scripts/analyze-migration.py
@@ -429,6 +429,10 @@ def __init__(self, desc, file):
         super(VMSDFieldStruct, self).__init__(desc, file)
         self.data = collections.OrderedDict()
 
+        if 'fields' not in self.desc['struct']:
+            raise Exception("No fields in subsection key=%s name=%s" %
+                            (self.section_key, self.vmsd_name))
+
         # When we see compressed array elements, unfold them here
         new_fields = []
         for field in self.desc['struct']['fields']:
@@ -477,6 +481,11 @@ def read(self):
                         raise Exception("Subsection %s not found at offset %x" % ( subsection['vmsd_name'], self.file.tell()))
                 name = self.file.readstr()
                 version_id = self.file.read32()
+
+                if not subsection:
+                    raise Exception("Empty description for subsection %s" %
+                                    name)
+
                 self.data[name] = VMSDSection(self.file, version_id, subsection, (name, 0))
                 self.data[name].read()
 
@@ -575,9 +584,8 @@ def __init__(self, filename):
         self.filename = filename
         self.vmsd_desc = None
 
-    def read(self, desc_only = False, dump_memory = False, write_memory = False):
-        # Read in the whole file
-        file = MigrationFile(self.filename)
+    def _read(self, file, vmsd_json, desc_only = False, dump_memory = False,
+              write_memory = False):
 
         # File magic
         data = file.read32()
@@ -589,7 +597,7 @@ def read(self, desc_only = False, dump_memory = False, write_memory = False):
         if data != self.QEMU_VM_FILE_VERSION:
             raise Exception("Invalid version number %d" % data)
 
-        self.load_vmsd_json(file)
+        self.load_vmsd_json(file, vmsd_json)
 
         # Read sections
         self.sections = collections.OrderedDict()
@@ -632,12 +640,25 @@ def read(self, desc_only = False, dump_memory = False, write_memory = False):
                 raise Exception("Mismatched section footer: %x vs %x" % (read_section_id, section_id))
             else:
                 raise Exception("Unknown section type: %d" % section_type)
-        file.close()
 
-    def load_vmsd_json(self, file):
+    def read(self, desc_only = False, dump_memory = False,
+             write_memory = False):
+        file = MigrationFile(self.filename)
         vmsd_json = file.read_migration_debug_json()
+
+        try:
+            self._read(file, vmsd_json, desc_only, dump_memory, write_memory)
+        except:
+            raise Exception("Full JSON dump:\n%s", vmsd_json)
+        finally:
+            file.close()
+
+    def load_vmsd_json(self, file, vmsd_json):
         self.vmsd_desc = json.loads(vmsd_json, object_pairs_hook=collections.OrderedDict)
         for device in self.vmsd_desc['devices']:
+            if 'fields' not in device:
+                raise Exception("vmstate for device %s has no fields" %
+                                device['name'])
             key = (device['name'], device['instance_id'])
             value = ( VMSDSection, device )
             self.section_classes[key] = value
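One detail visible in the sample output quoted earlier in the thread: the
line raise Exception("Full JSON dump:\n%s", vmsd_json) passes two arguments
to Exception, so printing the exception shows the args tuple with the JSON
repr-quoted rather than an interpolated message. A small illustration (the
JSON literal here is just an example):

vmsd_json = '{"devices": []}'

# Two arguments: str() of the exception is the args tuple, JSON repr-quoted.
print(Exception("Full JSON dump:\n%s", vmsd_json))
# -> ('Full JSON dump:\n%s', '{"devices": []}')

# %-formatting the message first gives a plain, readable dump.
print(Exception("Full JSON dump:\n%s" % vmsd_json))
# -> Full JSON dump:
#    {"devices": []}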
The analyze-migration script was seen failing on s390x in mysterious
ways. It seems we're reaching the subsection constructor without any
fields, which would indicate an empty .subsection entry in the vmstate
definition. We don't have any of those, at least not without the
unmigratable flag set, so this should never happen.

Add some debug statements so that we can see what's going on the next
time the issue happens.

Signed-off-by: Fabiano Rosas <farosas@suse.de>
---
 scripts/analyze-migration.py | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)
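A possible shape for the "something generic" check discussed in the thread,
purely illustrative (the helper name and the keys it reports are assumptions,
not from any posted patch): report whichever identifying keys the description
happens to carry, so the same check works for a section and for a nested
struct field.

def check_struct_desc(desc):
    # Works for both sections and plain STRUCT fields: a section description
    # carries vmsd_name/version, while a struct field usually has only a
    # field name. Report whichever keys exist rather than assuming either.
    struct = desc.get('struct', {})
    if 'fields' not in struct:
        ident = {k: desc[k] for k in ('vmsd_name', 'name', 'version')
                 if k in desc}
        raise Exception("vmstate description has no fields: %s" % ident)

# Example with made-up data: a struct field description missing 'fields'.
check_struct_desc({'name': 'pcidev', 'struct': {}})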