From 51dc5ce6c53f83c825a4eec9a1698a4dcb99e50f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=B6ne=2C=20Tjark=20Leon=20Raphael?=
 <tjark.leon.raphael.groene@uni-hamburg.de>
Date: Wed, 18 Jun 2025 16:34:37 +0200
Subject: [PATCH] Update file maxwell_integrate_to_h5.py

---
 maxwell_integrate_to_h5.py | 231 ++++++++++++++++++-------------------
 1 file changed, 115 insertions(+), 116 deletions(-)

diff --git a/maxwell_integrate_to_h5.py b/maxwell_integrate_to_h5.py
index 0897401..0feac4c 100644
--- a/maxwell_integrate_to_h5.py
+++ b/maxwell_integrate_to_h5.py
@@ -181,7 +181,7 @@ def integrate_ims_in_dir(path_im, path_int, dtype_im=".tif", dtype_int=".dat"):
-            # Use a lambda function to pass the subdir_path_int to the integration_thread
-            # Map async allows us to run the integration in parallel, with trace back of varibles
+            # Use a lambda function to pass each image filename to the integration_thread
+            # map_async allows us to run the integration in parallel while keeping track of the results
             async_result = pool.map_async(
-                lambda fname_im: integration_thread(fname_im, subdir_path_int),
+                lambda fname_im: integration_thread(fname_im),
                 filtered_fnames
             )
 
@@ -261,121 +261,120 @@ def integrate_ims_in_dir(path_im, path_int, dtype_im=".tif", dtype_int=".dat"):
 
         for idx, result in enumerate(results_data, start=1):
             # Drop unfinished scans (usually last scan due to closed shutter)
-            if not result["q"].size or not result["I"].size:
-                print(f"Skipping invalid scan data for entry {entry_name}")
+            try:
+                # Here one could use the image sequence number from the metadata; however, we use the index as it seems cleaner
+                entry_name = f"{idx:05d}.1"
+
+                # Create a new entry for each scan
+                entry = h5.create_group(entry_name)
+                entry["title"] = "Collected Q-I scans"
+                entry.attrs["NX_class"] = "NXentry"
+
+                # Store the acquisition time for the entry
+                entry.create_dataset("time", data=results_metadata[idx-1]["dateString"].encode('utf-8'), dtype=h5py.string_dtype(encoding='utf-8'))
+
+                # We log the image sequence number if it is available in the metadata
+                image_sequence_number = results_metadata[idx-1].get("imageSequenceNumber", "").strip()
+                if image_sequence_number.isdigit():
+                    entry.create_dataset("image sequence number", data=np.asarray([int(image_sequence_number)], dtype=np.int32), dtype="i4", compression_opts=4, compression="gzip")
+
+                # User comments can be added to the entry if available
+                # We check whether any of the user comments are present; if so, we create a comments group
+                # Note: scalar string datasets do not support compression filters, so these are stored uncompressed
+                if any(results_metadata[idx-1][key] for key in ["userComment1", "userComment2", "userComment3", "userComment4"]):
+                    comments = entry.create_group("comments")
+                    comments.attrs["NX_class"] = "NXcomments"
+                    if results_metadata[idx-1]["userComment1"]:
+                        comments.create_dataset("userComment1", data=results_metadata[idx-1]["userComment1"].encode('utf-8'), dtype=h5py.string_dtype(encoding='utf-8'))
+                    if results_metadata[idx-1]["userComment2"]:
+                        comments.create_dataset("userComment2", data=results_metadata[idx-1]["userComment2"].encode('utf-8'), dtype=h5py.string_dtype(encoding='utf-8'))
+                    if results_metadata[idx-1]["userComment3"]:
+                        comments.create_dataset("userComment3", data=results_metadata[idx-1]["userComment3"].encode('utf-8'), dtype=h5py.string_dtype(encoding='utf-8'))
+                    if results_metadata[idx-1]["userComment4"]:
+                        comments.create_dataset("userComment4", data=results_metadata[idx-1]["userComment4"].encode('utf-8'), dtype=h5py.string_dtype(encoding='utf-8'))
+
+                # Instrument / Detector group (holds all detector data)
entry.create_group("instrument/detector") + detector.attrs["NX_class"] = "NXdetector" + # Compress the data to save space, chunks are used to allow for efficient reading + # Larger chunk sizes increase compression but may slow down reading + # 256 is a common chunk size, (512 is also a good choice for larger datasets), over 1024 may lead to memory issues + chunk_size = 512 + + # Create datasets for q, I, and dI with compression + # We use np.asarray to ensure the data is in the correct format + # and dtype is set to float64 for better precision + detector.create_dataset("q [Å^-1]", data=np.asarray(result["q"], dtype=np.float64), chunks=(chunk_size,), dtype="f8", compression_opts=4, compression="gzip") + detector.create_dataset("I", data=np.asarray(result["I"], dtype=np.float64), chunks=(chunk_size,), dtype="f8", compression_opts=4, compression="gzip") + detector.create_dataset("dI", data=np.asarray(result["dI"], dtype=np.float64), chunks=(chunk_size,), dtype="f8", compression_opts=4, compression="gzip") + + # Handle missing or invalid metadata values with defaults + width = results_metadata[idx-1].get("width", "").strip() + height = results_metadata[idx-1].get("height", "").strip() + exposure_time = results_metadata[idx-1].get("exposureTime", "").strip() + summed_exposures = results_metadata[idx-1].get("summedExposures", "").strip() + + # Create detector size dataset if width and height are valid integers + # We check if the width and height are digits (i.e., valid integers) + if width.isdigit() and height.isdigit(): + det_size = detector.create_group("detector size") + det_size.attrs["NX_class"] = "NXcollection" + det_size.create_dataset("detector width [pixel]", data=np.asarray([int(width)], dtype=np.int32), dtype="i4", compression_opts=4, compression="gzip") + det_size.create_dataset("detector height [pixel]", data=np.asarray([int(height)], dtype=np.int32), dtype="i4", compression_opts=4, compression="gzip") + + # Also we trac exposure time and summed exposures if they are valid + if exposure_time.isdigit(): + detector.create_dataset("exposure time [s]", data=np.asarray([float(exposure_time)], dtype=np.float32), dtype="f4", compression_opts=4, compression="gzip") + if summed_exposures.replace('.', '', 1).isdigit(): + detector.create_dataset("summed exposures", data=np.asarray([int(summed_exposures)], dtype=np.int32), dtype="i4", compression_opts=4, compression="gzip") + + # Add interpretation info (optional for PyMca) + detector["I"].attrs["interpretation"] = "spectrum" + + # Measurement group (holds soft links) + meas = entry.create_group("measurement") + meas.attrs["NX_class"] = "NXdata" + meas.attrs["signal"] = "I" + meas.attrs["axes"] = "q" + meas.attrs["filename"] = result["filename"] + + # Create soft links to the detector datasets + # We use soft links to the detector datasets to allow for easy access + # This is useful for PyMca and other tools that expect these links + meas["I"] = h5py.SoftLink(f"/{entry_name}/instrument/detector/I") + meas["q [Å^-1]"] = h5py.SoftLink(f"/{entry_name}/instrument/detector/q [Å^-1]") + meas["dI"] = h5py.SoftLink(f"/{entry_name}/instrument/detector/dI") + + # Optional display-friendly names + meas["I"].attrs["long_name"] = "Intensity" + meas["q [Å^-1]"].attrs["long_name"] = "Q [1/Å]" + + # Measurement group (holds soft links) + # We create a plotselect group to allow for easy plotting in h5Web or PyMca + # This group will contain soft links to the datasets in the measurement group + plotselect = entry.create_group("plotselect") + 
plotselect.attrs["NX_class"] = "NXdata" + plotselect.attrs["signal"] = "I" + plotselect.attrs["axes"] = "q" + + plotselect["I"] = h5py.SoftLink(f"/{entry_name}/instrument/detector/I") + plotselect["q [Å^-1]"] = h5py.SoftLink(f"/{entry_name}/instrument/detector/q [Å^-1]") + + # Optional display-friendly names + plotselect["I"].attrs["long_name"] = "Intensity" + plotselect["q [Å^-1]"].attrs["long_name"] = "Q [1/Å]" + + # For PyMca auto-plot: + entry.attrs["default"] = "plotselect" + + # Optional global default plot group + if idx == len(results_data): # mark the last one as global default + entry["last_plot"] = h5py.SoftLink(f"/{subdir_name}/{entry_name}/measurement") + + except Exception as e: + print(f"Error processing file {result['filename']}: {e}") continue - # Here one could use the image sequence number from the metadata, however, we use the index as it seemes cleaner - entry_name = f"{idx:05d}.1" - - # Create a new entry for each scan - entry = h5.create_group(entry_name) - entry["title"] = "Collected Q-I scans" - entry.attrs["NX_class"] = "NXentry" - - - - # Set time attributes for the entry - entry.create_dataset("time", data=results_metadata[idx-1]["dateString"].encode('utf-8'), dtype=h5py.string_dtype(encoding='utf-8')) - - # We log the image sequence number if it is available in the metadata - image_sequence_number = results_metadata[idx-1].get("imageSequenceNumber", "").strip() - if image_sequence_number.isdigit(): - entry.create_dataset("image sequence number", data=np.asarray([int(image_sequence_number)], dtype=np.int32), dtype="i4", compression_opts=4, compression="gzip") - - # User comments can be added to the entry if available - # We check if any of the user comments are available, if so, we create a comments group - if any(results_metadata[idx-1][key] for key in ["userComment1", "userComment2", "userComment3", "userComment4"]): - comments = entry.create_group("comments") - comments.attrs["NX_class"] = "NXcomments" - if results_metadata[idx-1]["userComment1"]: - comments.create_dataset("userComment1", data=results_metadata[idx-1]["userComment1"].encode('utf-8'), dtype=h5py.string_dtype(encoding='utf-8'),compression_opts=4, compression="gzip") - if results_metadata[idx-1]["userComment2"]: - comments.create_dataset("userComment2", data=results_metadata[idx-1]["userComment2"].encode('utf-8'), dtype=h5py.string_dtype(encoding='utf-8'),compression_opts=4, compression="gzip") - if results_metadata[idx-1]["userComment3"]: - comments.create_dataset("userComment3", data=results_metadata[idx-1]["userComment3"].encode('utf-8'), dtype=h5py.string_dtype(encoding='utf-8'),compression_opts=4, compression="gzip") - if results_metadata[idx-1]["userComment4"]: - comments.create_dataset("userComment4", data=results_metadata[idx-1]["userComment4"].encode('utf-8'), dtype=h5py.string_dtype(encoding='utf-8'),compression_opts=4, compression="gzip") - - - # Instrument / Detector group (holds all detector data) - detector = entry.create_group("instrument/detector") - detector.attrs["NX_class"] = "NXdetector" - # Compress the data to save space, chunks are used to allow for efficient reading - # Larger chunk sizes increase compression but may slow down reading - # 256 is a common chunk size, (512 is also a good choice for larger datasets), over 1024 may lead to memory issues - chunk_size = 512 - - # Create datasets for q, I, and dI with compression - # We use np.asarray to ensure the data is in the correct format - # and dtype is set to float64 for better precision - detector.create_dataset("q 
[Å^-1]", data=np.asarray(result["q"], dtype=np.float64), chunks=(chunk_size,), dtype="f8", compression_opts=4, compression="gzip") - detector.create_dataset("I", data=np.asarray(result["I"], dtype=np.float64), chunks=(chunk_size,), dtype="f8", compression_opts=4, compression="gzip") - detector.create_dataset("dI", data=np.asarray(result["dI"], dtype=np.float64), chunks=(chunk_size,), dtype="f8", compression_opts=4, compression="gzip") - - # Handle missing or invalid metadata values with defaults - width = results_metadata[idx-1].get("width", "").strip() - height = results_metadata[idx-1].get("height", "").strip() - exposure_time = results_metadata[idx-1].get("exposureTime", "").strip() - summed_exposures = results_metadata[idx-1].get("summedExposures", "").strip() - - # Create detector size dataset if width and height are valid integers - # We check if the width and height are digits (i.e., valid integers) - if width.isdigit() and height.isdigit(): - det_size = detector.create_group("detector size") - det_size.attrs["NX_class"] = "NXcollection" - det_size.create_dataset("detector width [pixel]", data=np.asarray([int(width)], dtype=np.int32), dtype="i4", compression_opts=4, compression="gzip") - det_size.create_dataset("detector height [pixel]", data=np.asarray([int(height)], dtype=np.int32), dtype="i4", compression_opts=4, compression="gzip") - - # Also we trac exposure time and summed exposures if they are valid - if exposure_time.isdigit(): - detector.create_dataset("exposure time [s]", data=np.asarray([float(exposure_time)], dtype=np.float32), dtype="f4", compression_opts=4, compression="gzip") - if summed_exposures.replace('.', '', 1).isdigit(): - detector.create_dataset("summed exposures", data=np.asarray([int(summed_exposures)], dtype=np.int32), dtype="i4", compression_opts=4, compression="gzip") - - # Add interpretation info (optional for PyMca) - detector["I"].attrs["interpretation"] = "spectrum" - - # Measurement group (holds soft links) - meas = entry.create_group("measurement") - meas.attrs["NX_class"] = "NXdata" - meas.attrs["signal"] = "I" - meas.attrs["axes"] = "q" - meas.attrs["filename"] = result["filename"] - - # Create soft links to the detector datasets - # We use soft links to the detector datasets to allow for easy access - # This is useful for PyMca and other tools that expect these links - meas["I"] = h5py.SoftLink(f"/{entry_name}/instrument/detector/I") - meas["q [Å^-1]"] = h5py.SoftLink(f"/{entry_name}/instrument/detector/q [Å^-1]") - meas["dI"] = h5py.SoftLink(f"/{entry_name}/instrument/detector/dI") - - # Optional display-friendly names - meas["I"].attrs["long_name"] = "Intensity" - meas["q [Å^-1]"].attrs["long_name"] = "Q [1/Å]" - - # Measurement group (holds soft links) - # We create a plotselect group to allow for easy plotting in h5Web or PyMca - # This group will contain soft links to the datasets in the measurement group - plotselect = entry.create_group("plotselect") - plotselect.attrs["NX_class"] = "NXdata" - plotselect.attrs["signal"] = "I" - plotselect.attrs["axes"] = "q" - - plotselect["I"] = h5py.SoftLink(f"/{entry_name}/instrument/detector/I") - plotselect["q [Å^-1]"] = h5py.SoftLink(f"/{entry_name}/instrument/detector/q [Å^-1]") - - # Optional display-friendly names - plotselect["I"].attrs["long_name"] = "Intensity" - plotselect["q [Å^-1]"].attrs["long_name"] = "Q [1/Å]" - - # For PyMca auto-plot: - entry.attrs["default"] = "plotselect" - - # Optional global default plot group - if idx == len(results_data): # mark the last one as global default - 
entry["last_plot"] = h5py.SoftLink(f"/{subdir_name}/{entry_name}/measurement") - print(f"✅ HDF5 file '{output_file}' created with {len(results_data)} spectra.") -- GitLab