# convert_pascal_to_googlecsv.py

import argparse
import csv
import os
import xml.etree.ElementTree as ET
import random

def pascal_voc_to_mlflow_csv(pascal_voc_file, mlflow_csv_file):
    """
    Converts a label list generated by labelImg in Pascal VOC format into MLFlow CSV format used by Google.

    One CSV row is appended per <object> element. Bounding-box corners are
    divided by the image width/height taken from the <size> element and
    rounded to 6 decimals, so they are written as fractions of the image size.
    A header row is written first when the CSV file is still empty.

    The image path column is built as
    <grandparent directory of the XML file>/<folder text>/<filename text>;
    a missing or empty <folder> element contributes nothing to the path.

    Parameters:
    pascal_voc_file (str): Path to the Pascal VOC file to convert.
    mlflow_csv_file (str): Path to the MLFlow CSV file to create or append to.
    """
    # Let ElementTree open the file itself so the encoding declared in the
    # XML prolog is used instead of the locale default.
    root = ET.parse(pascal_voc_file).getroot()

    size = root.find('size')
    width = int(size.find('width').text)
    height = int(size.find('height').text)

    parent_dir = os.path.dirname(os.path.dirname(pascal_voc_file))
    # findtext() yields None when <folder> is absent and '' when it is empty;
    # both cases fall back to "no sub-directory".
    images_dir_name = root.findtext('folder') or ""
    # The path is the same for every object of this file, so build it once.
    image_path = os.path.join(parent_dir, images_dir_name, root.find('filename').text)

    with open(mlflow_csv_file, 'a', newline='') as mlflow_csv:
        writer = csv.writer(mlflow_csv, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        # Append mode creates the file if needed; size 0 means nothing has
        # been written yet, so the header goes first.
        if os.path.getsize(mlflow_csv_file) == 0:
            writer.writerow(['path', 'label', 'x_min', 'y_min', '', '', 'x_max', 'y_max', '', ''])

        for obj in root.findall('object'):
            bbox = obj.find('bndbox')
            x_min = round(float(bbox.find('xmin').text) / width, 6)
            y_min = round(float(bbox.find('ymin').text) / height, 6)
            x_max = round(float(bbox.find('xmax').text) / width, 6)
            y_max = round(float(bbox.find('ymax').text) / height, 6)
            writer.writerow([
                image_path,
                obj.find('name').text,
                x_min,
                y_min,
                '',
                '',
                x_max,
                y_max,
                '',
                '',
            ])
def merge_csv_files(file_list, output_file_name):
    """
    Merge several CSV files into one and delete the input files.

    The merged file is created in the directory of the first input file. If a
    file named ``output_file_name`` already exists there, an increasing number
    is appended to the base name (``name1.csv``, ``name2.csv``, ...) until an
    unused name is found, so earlier results are never overwritten.

    The first row of every input file is treated as its header; the header of
    the first non-empty input is written once, followed by the data rows of
    all inputs in order. Each input file is removed after it has been copied.

    Parameters:
    file_list (list[str]): Paths of the CSV files to merge. An empty list is
        reported and ignored.
    output_file_name (str): File name (not path) of the merged CSV.
    """
    if not file_list:
        print("No CSV files to merge.")
        return

    output_folder = os.path.dirname(file_list[0])
    output_file_path = os.path.join(output_folder, output_file_name)

    # If the file exists, add an increment number to the name
    base_name, extension = os.path.splitext(output_file_name)
    increment = 1
    while os.path.isfile(output_file_path):
        output_file_path = os.path.join(output_folder, f"{base_name}{increment}{extension}")
        increment += 1

    with open(output_file_path, 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        header_written = False

        for file_name in file_list:
            with open(file_name, 'r', newline='') as input_file:
                reader = csv.reader(input_file)
                # next(..., None) tolerates an input file that is completely empty.
                header = next(reader, None)
                if header is not None and not header_written:
                    writer.writerow(header)
                    header_written = True
                writer.writerows(reader)
            # Only delete an input once all of its rows have been copied.
            os.remove(file_name)

    print(f"Merged CSV file saved as {output_file_path}")

def create_train_test_val_split(merged_file_path):
    """
    Interactively assign TRAIN/TEST/VALIDATE splits to a merged annotation CSV.

    The file is rewritten in place, without a header row and with ``\\n`` line
    endings. When the user asks for a split, the three proportions are
    prompted for (defaults 0.8 / 0.1 / 0.1), the image paths are shuffled, and
    every annotation row of an image receives that image's split label in its
    first column. All rows of one image always land in the same split.

    Both input layouts are accepted:
    * ``path, label, ...`` rows (optionally preceded by a header row) get a new
      first column holding the split label;
    * rows that already start with a split label have that label replaced, so
      the function can be re-run on its own output to reshuffle.

    Parameters:
    merged_file_path (str): Path to the merged CSV file to update in place.
    """
    # Labels that mark an existing split column. TRAIN/TEST/VALIDATE are the
    # ones written below; the other two are accepted in case the file was
    # produced elsewhere.
    known_splits = {'TRAIN', 'TEST', 'VALIDATE', 'VALIDATION', 'UNASSIGNED'}

    def is_number(text):
        try:
            float(text)
        except ValueError:
            return False
        return True

    def has_split_column(row):
        return len(row) > 1 and row[0].strip().upper() in known_splits

    def ask_proportion(prompt, default):
        answer = input(prompt).strip()
        return default if answer == '' else float(answer)

    with open(merged_file_path, 'r', newline='') as merged_file:
        rows = [row for row in csv.reader(merged_file) if row]

    # Annotation rows carry numeric box coordinates, a header row has no
    # numeric cell at all. Only drop the first row when it really is a header,
    # otherwise a header-less file would lose a data row on every run.
    if rows and not any(is_number(cell) for cell in rows[0]):
        rows = rows[1:]

    # Group annotations by image path so one image never spans two splits.
    file_data = {}
    for row in rows:
        path_index = 1 if has_split_column(row) else 0
        file_data.setdefault(row[path_index], []).append(row)

    # Shuffle the file names
    file_names = list(file_data)
    random.shuffle(file_names)

    # Prompt user for train-test-validation split
    create_split = input("Create train-test-validation split? (y/n): ")
    apply_split = create_split.strip().lower() in ('y', 'yes')
    if apply_split:
        while True:
            try:
                train_proportion = ask_proportion(
                    "Enter the proportion for train set (0 to 1, default 0.8): ", 0.8)
                test_proportion = ask_proportion(
                    "Enter the proportion for test set (0 to 1, default 0.1): ", 0.1)
                val_proportion = ask_proportion(
                    "Enter the proportion for validation set (0 to 1, default 0.1): ", 0.1)
            except ValueError:
                print("Error: Proportions must be numbers.")
                continue

            proportions = (train_proportion, test_proportion, val_proportion)
            # Written as "not all(...)" so that NaN is rejected as well.
            if not all(0 <= p <= 1 for p in proportions):
                print("Error: Each proportion must be between 0 and 1.")
                continue
            # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 is not exactly 1
            # in binary floating point.
            if abs(sum(proportions) - 1) > 1e-9:
                print("Error: The sum of proportions must be equal to 1.")
                continue
            break

        # Calculate split indices
        num_files = len(file_names)
        train_split_index = int(num_files * train_proportion)
        test_split_index = train_split_index + int(num_files * test_proportion)

        # Assign splits; whatever is left after TRAIN and TEST is VALIDATE.
        split_rows = []
        for i, file_name in enumerate(file_names):
            if i < train_split_index:
                split = 'TRAIN'
            elif i < test_split_index:
                split = 'TEST'
            else:
                split = 'VALIDATE'

            for row in file_data[file_name]:
                if has_split_column(row):
                    row[0] = split
                else:
                    row.insert(0, split)
                split_rows.append(row)
        rows = split_rows

    # Single rewrite: data rows only, no header.
    with open(merged_file_path, 'w', newline='') as merged_file:
        csv.writer(merged_file, lineterminator='\n').writerows(rows)

    if apply_split:
        print(f"Train-test-validation split applied to {merged_file_path}")
    


def shuffle_splits(merged_file_path, merge_classes):
    """
    Optionally merge class labels of a merged CSV, then create a new split.

    Two interactive ways of merging classes are offered:
    * automatic (only when ``merge_classes`` is true and the user confirms):
      a mapping file with one ``class:target`` entry per line renames classes;
    * manual (when the automatic merge was not performed): a comma-separated
      list of classes is renamed to ``other``. If the first entry starts with
      ``not:`` the list is inverted, i.e. every class that is NOT listed is
      renamed to ``other``.

    When a merge is performed the result is written to
    ``<name>_shuffled<ext>`` next to the input and that file is used from then
    on; the input file itself is left untouched in that case. Finally
    ``create_train_test_val_split`` is called on the resulting file.

    Parameters:
    merged_file_path (str): Path to the merged CSV file.
    merge_classes (bool): Whether to offer the automatic class merge.
    """
    base_name, extension = os.path.splitext(merged_file_path)
    shuffled_file_path = f"{base_name}_shuffled{extension}"

    def class_index(row):
        # Rows may or may not start with a split column. A "." in column 1 is
        # taken to mean it holds the image path (file extension), which puts
        # the class label in column 2; otherwise column 1 is the label.
        # Returns None for rows too short to contain a label.
        if len(row) < 2:
            return None
        if "." in row[1]:
            return 2 if len(row) > 2 else None
        return 1

    def read_rows(path):
        with open(path, 'r', newline='') as csv_file:
            return list(csv.reader(csv_file))

    def write_rows(path, rows):
        with open(path, 'w', newline='') as csv_file:
            csv.writer(csv_file).writerows(rows)

    auto_merged = False

    # Check if automatic class merge is enabled
    if merge_classes:
        merge_mode = input("Enable automatic class merge? (y/n): ")
        if merge_mode.lower() == 'y':
            merge_file_path = input("Enter the URL to the merge file: ")
            class_mapping = {}

            # Read the class mapping file
            with open(merge_file_path, 'r') as merge_file:
                for line in merge_file:
                    line = line.strip()
                    if not line:
                        continue
                    # Split on the first colon only, so a malformed line can
                    # no longer abort the run with an unpacking error.
                    class_name, separator, merge_to = line.partition(':')
                    if not separator:
                        print(f"Skipping malformed mapping line: {line}")
                        continue
                    class_mapping[class_name.strip()] = merge_to.strip()

            print("Performing automatic class merge using the mapping file.")
            print(class_mapping)

            # Replace class names based on the mapping file, using the same
            # label-column detection as the manual merge.
            rows = read_rows(merged_file_path)
            for row in rows:
                index = class_index(row)
                if index is not None and row[index] in class_mapping:
                    row[index] = class_mapping[row[index]]

            # Write the modified data to the shuffled file
            write_rows(shuffled_file_path, rows)
            merged_file_path = shuffled_file_path
            auto_merged = True
        else:
            print("Skipping automatic class merge.")

    # If automatic class merge is not enabled or was declined, ask for manual class merge
    if not auto_merged:
        rows = read_rows(merged_file_path)

        # Collect all class names; the first row is skipped as a header.
        class_names = set()
        for row in rows[1:]:
            index = class_index(row)
            if index is not None:
                class_names.add(row[index])

        class_list = input("Enter a list of classes to merge (separated by comma), or press Enter to skip: ")

        if class_list:
            class_list = [c.strip() for c in class_list.split(",")]

            if class_list[0].lower().startswith("not:"):
                # Slice the prefix off instead of str.replace so that "NOT:"
                # and "Not:" are stripped too, matching the check above.
                class_list[0] = class_list[0][len("not:"):].strip()
                not_class_list = class_list
                print(not_class_list)
                class_list = sorted(class_names - set(not_class_list))

            print(f"Merging classes {class_list} to 'other'")

            # Replace class names with 'other'
            classes_to_merge = set(class_list)
            for row in rows:
                index = class_index(row)
                if index is not None and row[index] in classes_to_merge:
                    row[index] = "other"

            # Write the modified data to the shuffled file
            write_rows(shuffled_file_path, rows)
            merged_file_path = shuffled_file_path
        else:
            print("Skipping manual class merge.")

    # Shuffle the splits and generate a new train-test-validation split
    create_train_test_val_split(merged_file_path)

def main(directory, merged_csv):
    """
    Run one of the two modes of the script.

    * With ``merged_csv`` set: ask whether classes should be merged and hand
      the file to ``shuffle_splits``.
    * Otherwise: convert every ``.xml`` file in ``directory`` to an
      intermediate ``<name>_mlflow.csv`` (created relative to the current
      working directory) and merge those into
      ``merged_annotations_mlflow.csv``.

    Parameters:
    directory (str): Directory containing the Pascal VOC files to convert.
    merged_csv (str | None): Path to an already merged CSV file, if any.
    """
    if merged_csv:
        merge_classes = input("Do you want to merge classes? (y/n): ").lower() == 'y'
        shuffle_splits(merged_csv, merge_classes)
        return

    file_list = []
    # Sorted so the row order of the merged file does not depend on the
    # arbitrary order returned by os.listdir.
    for file in sorted(os.listdir(directory)):
        if file.endswith('.xml'):
            output_file = os.path.splitext(file)[0] + '_mlflow.csv'
            pascal_voc_file = os.path.join(directory, file)
            file_list.append(output_file)
            pascal_voc_to_mlflow_csv(pascal_voc_file, output_file)

    # Nothing to merge: report it instead of failing inside merge_csv_files.
    if not file_list:
        print(f"No Pascal VOC (.xml) files found in {directory}")
        return

    output_file_name = 'merged_annotations_mlflow.csv'
    merge_csv_files(file_list, output_file_name)

if __name__ == '__main__':
    # Command-line entry point: an optional positional directory (defaults to
    # the current one) and an optional --merged_csv path, both passed to main().
    cli = argparse.ArgumentParser(
        description='Converts Pascal VOC files to MLFlow CSV format and reshuffles splits',
    )
    cli.add_argument(
        'directory',
        nargs='?',
        default='.',
        type=str,
        help='Path to the directory containing the Pascal VOC files to convert',
    )
    cli.add_argument(
        '--merged_csv',
        type=str,
        help='Path to the already merged CSV file',
    )
    options = cli.parse_args()
    main(options.directory, options.merged_csv)