
CZII - CryoET Object Identification #4 Making synthetic data for Baseline YOLO11 Solution

dongsunseng 2025. 1. 28. 21:58

This post annotates the code that builds the datasets for the baseline YOLO solution, with additional (synthetic) data included.

CZII making datasets for YOLO + synthetic data

1) Install + Import

!pip install zarr opencv-python

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zarr
from tqdm import tqdm
import glob, os
import cv2
import shutil

2) Load + Organize data

runs = sorted(glob.glob('/kaggle/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns/*'))
print(runs)

runs = [os.path.basename(x) for x in runs]

# Processing additional dataset
additional_runs = sorted(glob.glob('/kaggle/input/czii10441/10441/T*'))
print(additional_runs)
additional_runs = [os.path.basename(x) for x in additional_runs]
runs = runs + additional_runs

# Creating mapping dictionaries
i2r_dict = {i: r for i, r in zip(range(len(runs)), runs)}
r2t_dict = {r: i for i, r in zip(range(len(runs)), runs)}
print("Runs:", i2r_dict)
  • runs = sorted(glob.glob('/kaggle/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns/*'))
    • Collects paths from the base training dataset
    • Uses glob.glob() to get all experiment paths
    • Uses sorted() to put the paths in a deterministic order
  • runs = [os.path.basename(x) for x in runs]
    • Extracts the experiment names from the full paths
    • Uses os.path.basename() to extract just the experiment names from full paths
    • Processes all paths using list comprehension
  • Processing additional data part
    • Processes paths from additional dataset (czii10441) in the same way
    • Merges additional dataset with existing experiment list
  • Creating mapping dictionaries part
    • i2r_dict: Maps indices to experiment names (index to run)
    • r2t_dict: Maps experiment names to indices (run to index)
    • Used as lookup tables for later data processing or reference
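Since the two dictionaries are inverses of each other, a quick round-trip lookup works as a sanity check. A minimal sketch, assuming runs has been built as above:

# Minimal sketch: round-trip lookup between the two mapping dictionaries
idx = 0
run_name = i2r_dict[idx]           # index -> experiment name
assert r2t_dict[run_name] == idx   # experiment name -> index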

3) Helper function - Normalization

# Normalize the image to a value between 0 and 255

def convert_to_8bit(x):
    # 1. Calculate percentiles for outlier removal
    lower, upper = np.percentile(x, (0.5, 99.5))
    
    # 2. Remove extreme values (clipping)
    x = np.clip(x, lower, upper)
    
    # 3. Convert to 0-255 range using Min-max normalization
    x = (x - x.min()) / (x.max() - x.min() + 1e-12) * 255
    
    # 4. Convert to 8-bit integer
    return x.round().astype("uint8")
  • Normalizes image data to 8-bit format (0-255 range)
  • Crucial preprocessing step in CryoET image processing
  • Clipping: Reduces the impact of noise and extreme values
  • Min-max normalization:
    • 1e-12: Small value added to prevent division by zero
    • * 255: Scales 0-1 range to 0-255 range
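As a quick sanity check, running convert_to_8bit on random data (a minimal sketch, not competition data) confirms the output dtype and value range:

# Minimal sketch: sanity-check convert_to_8bit on a fake float volume
x = np.random.randn(8, 64, 64).astype(np.float32)  # random tomogram-like data
x8 = convert_to_8bit(x)
print(x8.dtype, x8.min(), x8.max())  # uint8, values spanning roughly 0..255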

4) Information about labels

p2i_dict = {
    'apo-ferritin': 0,
    'beta-amylase': 1,
    'beta-galactosidase': 2,
    'ribosome': 3,
    'thyroglobulin': 4,
    'virus-like-particle': 5
}

i2p = {v: k for k, v in p2i_dict.items()}

particle_radius = {
    'apo-ferritin': 60,
    'beta-amylase': 65,
    'beta-galactosidase': 90,
    'ribosome': 150,
    'thyroglobulin': 130,
    'virus-like-particle': 135,
}

particle_names = ['apo-ferritin', 'beta-amylase', 'beta-galactosidase', 'ribosome', 'thyroglobulin', 'virus-like-particle']

from scipy.ndimage import gaussian_filter, median_filter

def denoise_tomogram(tomogram, method='gaussian', **kwargs):
    """
    Apply denoising to a tomogram.

    Parameters:
        tomogram (np.ndarray): The input tomogram to denoise.
        method (str): The denoising method ('gaussian' or 'median').
        kwargs: Parameters for the respective method.
    
    Returns:
        np.ndarray: The denoised tomogram.
    """
    if method == 'gaussian':
        return gaussian_filter(tomogram, sigma=kwargs.get('sigma', 1))
    elif method == 'median':
        return median_filter(tomogram, size=kwargs.get('size', 3))
    else:
        raise ValueError(f"Unsupported denoising method: {method}")
  • Removes noise using Gaussian or median filter
  • Filter parameters can be flexibly adjusted via kwargs
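A minimal usage sketch, on a fake volume and with arbitrary parameter values:

# Minimal sketch: apply both denoising methods to a fake volume
vol = np.random.randn(16, 64, 64).astype(np.float32)
smooth = denoise_tomogram(vol, method='gaussian', sigma=2)  # stronger Gaussian blur
clean = denoise_tomogram(vol, method='median', size=5)      # 5-voxel median window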

name_map = {
    'apo-ferritin': 'ferritin_complex',
    'beta-amylase': 'beta_amylase',
    'beta-galactosidase': 'beta_galactosidase',
    'ribosome': 'cytosolic_ribosome',
    'thyroglobulin': 'thyroglobulin',
    'virus-like-particle': 'pp7_vlp',
}

def ndjson_to_json(ndjson_path):
    # Check if file exists
    if not os.path.isfile(ndjson_path):
        raise FileNotFoundError(f"The file {ndjson_path} does not exist.")

    data = []
    # Parse each line as a JSON object
    with open(ndjson_path, 'r', encoding='utf-8') as ndjson_file:
        for line_number, line in enumerate(ndjson_file, start=1):
            stripped_line = line.strip()
            if stripped_line:
                try:
                    json_object = json.loads(stripped_line)
                    data.append(json_object)
                except json.JSONDecodeError as e:
                    raise json.JSONDecodeError(
                        f"Error decoding JSON on line {line_number}: {e.msg}",
                        e.doc,
                        e.pos
                    )

    return data
  • Parses NDJSON (Newline Delimited JSON) files
  • Converts each line to individual JSON objects
  • Includes error handling and line number tracking
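To see the expected input, here is a minimal sketch that writes a two-line NDJSON file and parses it back. The field layout (one 'location' dict per line) mirrors what the annotation code below reads; the real files may contain additional keys:

# Minimal sketch: build and parse a tiny NDJSON file (hypothetical field layout)
sample = (
    '{"location": {"x": 1.0, "y": 2.0, "z": 3.0}}\n'
    '{"location": {"x": 4.0, "y": 5.0, "z": 6.0}}\n'
)
with open('sample.ndjson', 'w') as f:
    f.write(sample)
records = ndjson_to_json('sample.ndjson')
print(len(records), records[0]['location'])  # 2 {'x': 1.0, 'y': 2.0, 'z': 3.0}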

# Takes experiment name, train/validation flag, synthetic data flag as input
def make_annotate_yolo(run_name, is_train_path=True, is_syntetic=False):
    dataset_split = 'train' if is_train_path else 'val'
    
    # Loading and preprocessing volume data
    # Set the path to the denoised volume data
    if is_syntetic:
        vol_path = glob.glob(f'/kaggle/input/czii10441/10441/{run_name}/**/Tomograms/**/*.zarr', recursive=True)
        if not vol_path:
            print(f"No volume found for run {run_name} in synthetic data.")
            return
        vol_path = vol_path[0]
    else:
        vol_path = f'/kaggle/input/czii-cryo-et-object-identification/train/static/ExperimentRuns/{run_name}/VoxelSpacing10.000/denoised.zarr'
    
    print(f"Volume path: {vol_path}")
    if not os.path.exists(vol_path):
        print(f"Volume file not found: {vol_path}")
        return

    # Read the volume
    vol = zarr.open(vol_path, mode='r') # loads volume data in zarr format
    vol = vol[0]
    if is_syntetic:
        vol = denoise_tomogram(np.array(vol)[:184], method='gaussian', sigma=1)  # denoise synthetic data; [:184] crops to the real tomograms' slice count
    vol2 = convert_to_8bit(vol) # into 8-bit format
    
    n_imgs = vol2.shape[0]
    print(n_imgs)
    
    # Image generation - CONVERT 3D Volume data into 2D Images that YOLO can process
    for j in range(n_imgs):
        # 1. Extract current slice
        newvol = vol2[j]
        
        # 2. Convert grayscale to RGB
        newvolf = np.stack([newvol]*3, axis=-1)
        
        # 3. Resize to YOLO input size
        newvolf = cv2.resize(newvolf, (640, 640))
        
        # 4. Save image
        image_filename = f'images/{dataset_split}/{run_name}_{j*10}.png'
        cv2.imwrite(image_filename, newvolf)
        
        # 5. Create empty label file
        label_filename = f'labels/{dataset_split}/{run_name}_{j*10}.txt'
        with open(label_filename, 'w') as f:
            pass
    
    # Process each particle type (label processing)
    for p, particle in enumerate(tqdm(particle_names, desc=f"Processing particles for run {run_name}")):
        if particle == "beta-amylase":
            continue
        
        # Load JSON data for each particle
        if is_syntetic:
            particle_name_in_file = name_map.get(particle)
            if not particle_name_in_file:
                print(f"Particle name mapping not found for: {particle}")
                continue
            
            ndjson_each_particle = glob.glob(f'/kaggle/input/czii10441/10441/{run_name}/**/Annotations/**/*.ndjson', recursive=True)
            if not ndjson_each_particle:
                print(f"No NDJSON files found for particle: {particle} in run: {run_name}")
                continue
            
            filtered_ndjson_files = [f for f in ndjson_each_particle if particle_name_in_file in f]
            if not filtered_ndjson_files:
                print(f"No NDJSON files match the particle: {particle} for run: {run_name}")
                continue
            
            json_each_particle = ndjson_to_json(filtered_ndjson_files[0])
            df = pd.DataFrame(json_each_particle)
            
        # Data loading for real data
        else:
            json_each_particle = f"/kaggle/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns/{run_name}/Picks/{particle}.json"
            
            if not os.path.exists(json_each_particle):
                print(f"JSON file not found: {json_each_particle}")
                continue
            print(f"Loading JSON file: {json_each_particle}")
            try:
                df = pd.read_json(json_each_particle)
            except ValueError as e:
                print(f"Error reading JSON file {json_each_particle}: {e}")
                continue
                
        # Coordinate Extraction Processing 
        if is_syntetic:
            column_name = 'location'
        else:
            column_name = 'points'

        if column_name not in df.columns:
            print(f"'{column_name}' column not found in DataFrame for particle: {particle}")
            continue
        
        if is_syntetic:
            # Flattens nested JSON data into dataframe format
            normalized_data = pd.json_normalize(df[column_name])
            # *10.012: applies pixel scaling factor(converts to actual physical size)
            df[['x', 'y', 'z']] = normalized_data * 10.012
        
        # For real data: Extracts coordinates for each axis (x, y, z)
        else:      
            for axis in ["x", "y", "z"]:
                df[axis] = df[column_name].apply(lambda x: x["location"][axis] if "location" in x and axis in x["location"] else np.nan)

        # Missing Value Handling: Removes rows with NaN in any of x, y, z coordinates
        df.dropna(subset=["x", "y", "z"], inplace=True)

        # Get defined radius for each particle type
        radius = particle_radius.get(particle)
        if radius is None:
            print(f"Radius not defined for particle: {particle}")
            continue
        divide_by = 10.012
        
        # Convert to YOLO format
        for i, row in df.iterrows():    
            # Calculate Z-axis range (range of slices where particle is visible)
            start_z = np.round(row['z'] - radius).astype(np.int32)
            start_z = max(0, start_z//10) 
            end_z = np.round(row['z'] + radius).astype(np.int32)
            end_z = min(n_imgs, end_z//10)
            
            # Generate YOLO format labels for each slice
            for j in range(start_z, end_z):
                label_filename = f'labels/{dataset_split}/{run_name}_{j*10}.txt'
                
                # Calculate normalized coordinates
                x_center = row["x"] / divide_by / vol2.shape[1]
                y_center = row["y"] / divide_by / vol2.shape[2]
                box_width = (radius * 2) / divide_by / vol2.shape[1]
                box_height = (radius * 2) / divide_by / vol2.shape[2]
                
                # Save in YOLO format
                # format: class_id center_x center_y width height
                with open(label_filename, 'a') as f:
                    f.write(f'{p2i_dict.get(particle, 0)} {x_center:.6f} {y_center:.6f} {box_width:.6f} {box_height:.6f}\n')
  • Generating datasets for YOLO training
  • Overall process:
    • Convert 3D coordinates to 2D YOLO format
    • Generate labels for all slices within particle's Z-axis range
    • Normalize coordinates and box sizes to 0-1 range
    • YOLO format: class_id x_center y_center width height
    • This code plays a crucial role in converting 3D particle location information into 2D bounding box format that YOLO can understand.
  • Image generation
    • newvolf = np.stack([newvol]*3, axis=-1)
      • [newvol]*3: Replicate the same grayscale image 3 times
      • axis=-1: Stack along the last dimension (creating R,G,B channels)
      • Result: (height, width) -> (height, width, 3)
    • Image resizing:
      • 640x640 is the default input size for Ultralytics YOLO models, including YOLO11
  • Label Processing
    • Labels here refer to annotation information used for training YOLO object detection models
    • Exclude beta-amylase (excluded from competition evaluation)
    • Data Loading - For Synthetic Data
      • Synthetic data stored in NDJSON format
      • Filter by matching particle type in filename
      • Convert NDJSON to DataFrame
    • Data Loading - For Real Data
      • Real data stored in JSON format
      • Direct JSON file loading
    • Coordinate Extraction Processing
      • Different coordinate extraction methods for synthetic/real data
      • Normalize and scale coordinate values
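To make the coordinate math concrete, here is a worked example for a single hypothetical pick. The coordinates and the 630-pixel slice size are assumptions for illustration only; the real values come from the JSON files and vol2.shape:

# Worked example (hypothetical pick): normalized YOLO box for one ribosome
x_angstrom, z_angstrom = 3150.0, 900.0  # made-up coordinates in Angstroms
radius = particle_radius['ribosome']    # 150 Angstroms
divide_by = 10.012                      # Angstroms per voxel
slice_dim = 630                         # assumed in-plane size of vol2

x_center = x_angstrom / divide_by / slice_dim      # ~0.4994
box_width = (radius * 2) / divide_by / slice_dim   # ~0.0476
start_z, end_z = int(z_angstrom - radius) // 10, int(z_angstrom + radius) // 10
print(f'{x_center:.4f} {box_width:.4f} slices {start_z}..{end_z}')  # slices 75..105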

5) Prepare folders

os.makedirs("images/train", exist_ok=True)
os.makedirs("images/val", exist_ok=True)
os.makedirs("labels/train", exist_ok=True)
os.makedirs("labels/val", exist_ok=True)
  • exist_ok=True: No error if directories already exist

6) Create Dataset

validation_indices = [0, 1, 2, 3]  # TS_5_4, TS_69_2, TS_6_4, TS_6_6

#runs = runs[:7] 
    
for i, r in enumerate(runs):
    # Determine if training or validation
    is_train_path = i not in validation_indices
    
    # Determine if synthetic data (after index 7 is synthetic)
    is_syntetic = i > 7
    
    print(f"Processing Run {i}: {r}, Is Train: {is_train_path}")
    
    # Call dataset generation function
    make_annotate_yolo(r, is_train_path=is_train_path, is_syntetic=is_syntetic)
  • Generates the dataset by splitting it into training and validation sets
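A quick sketch to double-check the split logic (assumes runs and validation_indices from the cells above):

# Minimal sketch: preview which runs land in train/val and real/synthetic
for i, r in enumerate(runs):
    split = 'val' if i in validation_indices else 'train'
    source = 'synthetic' if i > 7 else 'real'
    print(i, r, split, source)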

7) Organize Dataset Folder Structure

# Create top-level dataset directory
os.makedirs('datasets/czii_det2d', exist_ok=True)

# Move image and label files to new locations
shutil.move('images/train', 'datasets/czii_det2d/images/train')
shutil.move('images/val', 'datasets/czii_det2d/images/val')
shutil.move('labels/train', 'datasets/czii_det2d/labels/train')
shutil.move('labels/val', 'datasets/czii_det2d/labels/val')
  • Reorganizes the generated training data into final directory structure expected by YOLO
  • Final dir structure:
    • datasets/
      └── czii_det2d/
          ├── images/
          │   ├── train/  # Training images
          │   └── val/    # Validation images
          └── labels/
              ├── train/  # Training labels
              └── val/    # Validation labels

8) Create Configuration File for YOLO

config_content = """
path: /kaggle/input/czii-making-datasets-for-yolo/datasets/czii_det2d  # Dataset root path
train: images/train  # Training images path (relative to path)
val: images/val      # Validation images path (relative to path)
# Classes
names:               # Class (particle type) definitions
  0: apo-ferritin
  1: beta-amylase
  2: beta-galactosidase
  3: ribosome
  4: thyroglobulin
  5: virus-like-particle
"""

# Create YAML file
with open("czii_conf.yaml", "w") as f:
    f.write(config_content.strip())
  • Generates a configuration file (YAML) for YOLO model training
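With the YAML in place, training would look roughly like the following minimal sketch. It assumes the ultralytics package is installed; the checkpoint name and epoch count are placeholders, not the author's actual settings:

# Minimal sketch: train a YOLO11 model on the generated dataset
from ultralytics import YOLO

model = YOLO('yolo11n.pt')  # pretrained nano checkpoint (placeholder choice)
model.train(data='czii_conf.yaml', epochs=10, imgsz=640)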

In order to make the impossible possible, you need to change the rules.
- Elon Musk -