# 335 lines, 18 KiB, Python — file-listing metadata (not part of the script)
import re
|
|
import os
|
|
import sys # Not strictly needed by this version but often kept from original
|
|
import glob
|
|
import pandas as pd # For Excel writing
|
|
|
|
# --- Functions for script operation ---
|
|
|
|
def find_working_directory():
    """Return the directory used for input-file discovery.

    Defaults to the current working directory. Adapt this function if a
    specific configuration source (config file, CLI argument) is needed.

    Returns:
        str: absolute path of the current working directory.
    """
    # Bug fix: the message previously referred to a stale function name
    # (`find_working_directory_from_x1`), which no longer exists here.
    print("Info: `find_working_directory` is using the current directory.")
    return os.getcwd()
|
|
|
|
def extract_sections(content):
    """
    Extracts UDT definitions, main declaration section, and initialization
    section from S7 AWL/DB content.

    All patterns use the inline ``(?is)`` flags (IGNORECASE + DOTALL) so
    keywords match across casings and newlines.

    Returns a 5-tuple:
        (udt_definitions, header_text, declaration_section,
         initialization_section, footer)
    ``header_text`` and ``footer`` are always empty strings in this version.
    """
    # Normalize Windows line endings so the patterns only deal with '\n'.
    normalized = content.replace('\r\n', '\n')

    # Collect every TYPE...END_TYPE block (UDT definitions).
    udt_blocks = [m for m in re.finditer(r'(?is)(TYPE\s+.*?\s+END_TYPE\s*\n?)', normalized)]
    udt_text = "".join(m.group(0) for m in udt_blocks)

    # Everything after the final UDT belongs to the main DATA_BLOCK.
    remainder = normalized[udt_blocks[-1].end():] if udt_blocks else normalized

    # Locate the main DATA_BLOCK header and the start of its STRUCT;
    # fall back to the first bare "STRUCT" when the header pattern fails.
    struct_region = remainder
    header = re.search(r'(?is)^(.*?(?:DATA_BLOCK.*?VERSION.*?\n))(.*?STRUCT)', remainder)
    if header:
        struct_region = remainder[header.start(2):]
    else:
        fallback = re.search(r'(?is)(.*?)(STRUCT)', remainder)
        if fallback:
            struct_region = remainder[fallback.start(2):]
        else:
            # No STRUCT at all -> declaration section will come out empty.
            print(f"Warning: No 'STRUCT' keyword found for main DB declarations in a content block.")

    # Declarations live between STRUCT and BEGIN; current values between
    # BEGIN and END_DATA_BLOCK. The footer after END_DATA_BLOCK is unused.
    decl = re.search(r'(?is)STRUCT\s*(.*?)BEGIN', struct_region)
    init = re.search(r'(?is)BEGIN\s*(.*?)END_DATA_BLOCK', struct_region)

    return (
        udt_text,
        "",  # header placeholder, not used by the comparison logic
        decl.group(1).strip() if decl else "",
        init.group(1).strip() if init else "",
        "",  # footer placeholder
    )
|
|
|
|
|
|
def find_comparison_files_detailed(working_dir, data_suffix="_data", format_suffix="_format", updated_suffix_part="_updated"):
    """Finds data, format, and _updated files based on naming conventions.

    Scans *working_dir* for files with common S7 export extensions
    (.db, .awl, .txt) and classifies them by substring matching on the
    lowercased basename:

      * "data" file: contains ``data_suffix`` but not ``updated_suffix_part``
      * "format" file: contains ``format_suffix`` but not ``updated_suffix_part``
      * "updated" file: first derived from the format file's name
        (Strategy 1), otherwise found by searching for
        ``updated_suffix_part`` (Strategy 2)

    Returns:
        tuple: ``(data_path, format_path, updated_path)``; any element may
        be None when no matching file is found.
    """
    all_files_in_dir = []
    for ext_pattern in ["*.db", "*.awl", "*.txt"]: # Common S7 export extensions
        all_files_in_dir.extend(glob.glob(os.path.join(working_dir, ext_pattern)))
    # Normalize paths for consistent comparisons and ensure uniqueness
    all_files_in_dir = sorted(list(set(os.path.normpath(f) for f in all_files_in_dir)))

    found_paths = {'data': None, 'format': None, 'updated': None}

    def select_best_file(file_list):
        # Pick the preferred candidate from a list of paths, or None.
        if not file_list: return None
        # Prioritize: .db, then .awl, then .txt
        # (False sorts before True, so a name containing '.db' wins.)
        file_list.sort(key=lambda x: ('.db' not in x.lower(), '.awl' not in x.lower(), '.txt' not in x.lower()))
        return file_list[0]

    # Find _data file: contains data_suffix, does not contain updated_suffix_part
    data_candidates = [f for f in all_files_in_dir if data_suffix in os.path.basename(f).lower() and updated_suffix_part not in os.path.basename(f).lower()]
    found_paths['data'] = select_best_file(data_candidates)

    # Find _format file: contains format_suffix, does not contain updated_suffix_part
    format_candidates = [f for f in all_files_in_dir if format_suffix in os.path.basename(f).lower() and updated_suffix_part not in os.path.basename(f).lower()]
    if found_paths['data'] and format_candidates: # Ensure it's not the same as _data file
        format_candidates = [f for f in format_candidates if f != found_paths['data']]
    found_paths['format'] = select_best_file(format_candidates)

    # Find _updated file:
    # Strategy 1: Based on format_file name (most reliable if format_file found)
    if found_paths['format']:
        format_basename = os.path.basename(found_paths['format'])
        name_part, first_ext = os.path.splitext(format_basename)
        updated_basename_candidate = ""
        # Handle double extensions like ".db.txt" or ".awl.txt"
        if first_ext.lower() == ".txt" and ('.db' in name_part.lower() or '.awl' in name_part.lower()):
            base_name_for_main_ext, second_ext = os.path.splitext(name_part)
            updated_basename_candidate = base_name_for_main_ext + updated_suffix_part + second_ext + first_ext
        else: # Single extension
            updated_basename_candidate = name_part + updated_suffix_part + first_ext

        # Only accept the derived name if it actually exists AND was picked
        # up by the extension glob above.
        potential_updated_path = os.path.join(working_dir, updated_basename_candidate)
        if os.path.exists(potential_updated_path) and potential_updated_path in all_files_in_dir:
            found_paths['updated'] = potential_updated_path

    # Strategy 2: If not found by deriving from format_file, search more broadly
    if not found_paths['updated']:
        updated_candidates = [f for f in all_files_in_dir if updated_suffix_part in os.path.basename(f).lower()]
        if found_paths['format'] and updated_candidates: # Prefer updated file related to format file's base name
            # NOTE(review): this containment check is case-sensitive, unlike
            # the lowercased matching above — confirm whether intentional.
            format_base = os.path.basename(found_paths['format']).split(format_suffix)[0]
            updated_candidates = [f for f in updated_candidates if format_base in os.path.basename(f)]

        # Exclude already identified data and format files
        if found_paths['data'] and updated_candidates: updated_candidates = [f for f in updated_candidates if f != found_paths['data']]
        if found_paths['format'] and updated_candidates: updated_candidates = [f for f in updated_candidates if f != found_paths['format']]
        found_paths['updated'] = select_best_file(updated_candidates)

    print("Identified files for comparison:")
    for key, val in found_paths.items():
        print(f" {key.capitalize()} file: {os.path.basename(val) if val else 'Not found'}")
    return found_paths['data'], found_paths['format'], found_paths['updated']
|
|
|
|
|
|
def get_variables_from_section_content(section_str, section_type="declaration"):
    """ Parses a declaration or initialization section string and returns a list of variable dicts. """
    parsed = []
    position = 0

    for raw_line in section_str.replace('\r\n', '\n').split('\n'):
        stripped = raw_line.strip()
        # Ignore blank lines and // comment lines.
        if not stripped or stripped.startswith('//'):
            continue

        upper = stripped.upper()
        # Purely structural keywords are skipped, unless the same line also
        # carries a full declaration/assignment (':' or ':=' plus ';').
        structural = (
            upper == 'STRUCT'
            or upper.startswith('TYPE ')
            or upper in ('END_STRUCT', 'BEGIN', 'END_DATA_BLOCK')
        )
        if structural and not ((':' in stripped and ';' in stripped) or (':=' in stripped and ';' in stripped)):
            continue

        name = dtype = val = None

        if section_type == "declaration":
            # Expected shape: VarName : VarType [:= InitialValue] ;
            if ':' not in stripped or ';' not in stripped:
                continue
            # Name: token before ':' (simple or "quoted" names).
            m_name = re.match(r'^\s*(\"(?:\\\"|[^\"])*\"|[a-zA-Z_][\w]*)', stripped, re.IGNORECASE)
            if m_name:
                name = m_name.group(1).strip().replace('"', "")
            # Type: between ':' and a potential ':=' or ';' ("UDT", simple, ARRAY).
            m_type = re.search(r':\s*(\"[^\"]+\"|[^:=;]+)', stripped, re.IGNORECASE)
            if m_type:
                dtype = m_type.group(1).strip().replace('"', "")
            # Optional initial value: between ':=' and ';'.
            m_assign = re.search(r':=\s*([^;]+)', stripped, re.IGNORECASE)
            if m_assign:
                val = m_assign.group(1).strip()
            # A declaration needs both a name and a type.
            if not name or not dtype:
                continue
        elif section_type == "initialization":
            # Expected shape: VarNameOrPath := Value ;
            if ':=' not in stripped or ';' not in stripped:
                continue
            # Name/path: before ':=' ("Quoted", Simple.Path, Array[1].Path).
            m_name = re.match(r'^\s*(\"(?:\\\"|[^\"])*\"|[a-zA-Z_][\w"\[\],\.]*(?:\[.*?\]|\.[a-zA-Z_][\w"\[\],\.]*)*)\s*:=', stripped, re.IGNORECASE)
            if m_name:
                name = m_name.group(1).strip().replace('"', "")
            # Value: between ':=' and ';'.
            m_value = re.search(r':=\s*([^;]+)', stripped, re.IGNORECASE)
            if m_value:
                val = m_value.group(1).strip()
            # An assignment needs both a name and a value.
            if not name or val is None:
                continue

        # Record the entry only when a name was successfully captured.
        if name is not None:
            parsed.append({
                "index": position, "name": name, "type": dtype, "value": val,
                "original_line": raw_line
            })
            position += 1
    return parsed
|
|
|
|
def process_file_for_vars(file_path):
    """
    Reads a file, extracts main STRUCT declarations and BEGIN block
    initializations.

    UDT definitions themselves are not included in the returned declaration
    list. Returns a pair of lists ``(main_struct_decl_vars,
    begin_block_init_vars)``; both are empty when the file is missing or
    unreadable.
    """
    # Missing path (or None/empty) -> nothing to parse.
    if not file_path or not os.path.exists(file_path):
        return [], []

    try:
        # utf-8-sig transparently strips a BOM if present.
        with open(file_path, 'r', encoding='utf-8-sig') as handle:
            text = handle.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return [], []

    # UDT definitions are extracted but not used for the comparison lists.
    _udts, _header, decl_text, init_text, _footer = extract_sections(text)

    # Initial values come from the main DATA_BLOCK's STRUCT section;
    # current values come from the BEGIN...END_DATA_BLOCK section.
    return (
        get_variables_from_section_content(decl_text, "declaration"),
        get_variables_from_section_content(init_text, "initialization"),
    )
|
|
|
|
|
|
def _build_comparison_rows(data_vars, format_vars, updated_vars, resolve_type):
    """Positionally align three variable lists into Excel row dicts.

    Rows are matched by index, padding shorter lists with blanks. When
    *resolve_type* is True the "Data Type" column is filled from the first
    available type (priority: format, then updated, then data); otherwise it
    is "N/A" (initialization lines do not re-declare types).
    """
    placeholder = {"name": "", "type": "", "value": "", "original_line": ""}
    lengths = [len(lst) for lst in [data_vars, format_vars, updated_vars] if lst is not None]
    max_len = max(lengths) if lengths else 0

    rows = []
    for i in range(max_len):
        var_d = data_vars[i] if data_vars and i < len(data_vars) else placeholder
        var_f = format_vars[i] if format_vars and i < len(format_vars) else placeholder
        var_u = updated_vars[i] if updated_vars and i < len(updated_vars) else placeholder

        # Combined name from _data and _format, falling back to _updated.
        name_d_str = var_d['name'] if var_d['name'] else ""
        name_f_str = var_f['name'] if var_f['name'] else ""
        combined_name = f"{name_d_str} / {name_f_str}".strip(" /")
        if not combined_name:
            combined_name = var_u['name'] or name_d_str or name_f_str

        if resolve_type:
            type_to_use = var_f['type'] or var_u['type'] or var_d['type'] or "N/A"
        else:
            type_to_use = "N/A"

        rows.append({
            "Variable Name (_data / _format)": combined_name,
            "Data Type": type_to_use,
            "Value (_data)": str(var_d['value']) if var_d['value'] is not None else "",
            "Value (_format)": str(var_f['value']) if var_f['value'] is not None else "",
            "Value (_updated)": str(var_u['value']) if var_u['value'] is not None else ""
        })
    return rows


def _rows_to_ordered_df(rows, column_order):
    """Build a DataFrame from row dicts and enforce a fixed column order."""
    df = pd.DataFrame(rows)
    if not df.empty:
        for col in column_order:
            if col not in df.columns:
                df[col] = ""  # Ensure all expected columns exist
        df = df[column_order]
    return df


def generate_excel_comparison(data_file, format_file, updated_file, output_excel_path):
    """Generates an Excel file with two sheets comparing variables from three source files.

    Sheet 1 ("Declarations (Initial Values)") aligns the STRUCT-section
    declarations; sheet 2 ("Initializations (Current Values)") aligns the
    BEGIN-block assignments. Alignment is positional, not name-based.

    Refactor note: the two sheet-building loops were near-duplicates
    (~60 lines); they now share _build_comparison_rows/_rows_to_ordered_df.
    """
    print(f"\nProcessing _data file: {os.path.basename(data_file) if data_file else 'N/A'}")
    data_decl_vars, data_init_vars = process_file_for_vars(data_file)
    print(f" Found {len(data_decl_vars)} declaration vars, {len(data_init_vars)} initialization vars in _data file.")

    print(f"Processing _format file: {os.path.basename(format_file) if format_file else 'N/A'}")
    format_decl_vars, format_init_vars = process_file_for_vars(format_file)
    print(f" Found {len(format_decl_vars)} declaration vars, {len(format_init_vars)} initialization vars in _format file.")

    print(f"Processing _updated file: {os.path.basename(updated_file) if updated_file else 'N/A'}")
    updated_decl_vars, updated_init_vars = process_file_for_vars(updated_file)
    print(f" Found {len(updated_decl_vars)} declaration vars, {len(updated_init_vars)} initialization vars in _updated file.")

    # Single column order shared by both sheets.
    column_order = ["Variable Name (_data / _format)", "Data Type", "Value (_data)", "Value (_format)", "Value (_updated)"]

    # --- "Declarations (Initial Values)" sheet (STRUCT section) ---
    decl_rows = _build_comparison_rows(data_decl_vars, format_decl_vars, updated_decl_vars, resolve_type=True)
    print(f"\nComparing {len(decl_rows)} positional declaration entries (STRUCT section)...")
    df_declarations = _rows_to_ordered_df(decl_rows, column_order)

    # --- "Initializations (Current Values)" sheet (BEGIN block) ---
    init_rows = _build_comparison_rows(data_init_vars, format_init_vars, updated_init_vars, resolve_type=False)
    print(f"Comparing {len(init_rows)} positional initialization entries (BEGIN block)...")
    df_initializations = _rows_to_ordered_df(init_rows, column_order)

    # --- Write to Excel with two sheets ---
    try:
        with pd.ExcelWriter(output_excel_path, engine='openpyxl') as writer:
            if not df_declarations.empty:
                df_declarations.to_excel(writer, sheet_name='Declarations (Initial Values)', index=False)
                print(f"Written 'Declarations (Initial Values)' sheet with {len(df_declarations)} rows.")
            else:
                print("No data for 'Declarations (Initial Values)' sheet.")

            if not df_initializations.empty:
                df_initializations.to_excel(writer, sheet_name='Initializations (Current Values)', index=False)
                print(f"Written 'Initializations (Current Values)' sheet with {len(df_initializations)} rows.")
            else:
                print("No data for 'Initializations (Current Values)' sheet.")

        if df_declarations.empty and df_initializations.empty:
            print("No data written to Excel as both datasets are empty.")
        else:
            print(f"\nSuccessfully generated Excel comparison: {output_excel_path}")

    except Exception as e:
        print(f"Error writing Excel file {output_excel_path}: {e}")
|
|
|
|
|
|
def main_comparator():
    """Entry point: locate the three input files and produce the workbook."""
    print("S7 Data Block Comparator to Excel (Multi-Sheet)")
    print("==============================================")

    working_dir = find_working_directory()
    print(f"Using working directory: {working_dir}")

    data_f, format_f, updated_f = find_comparison_files_detailed(working_dir)

    # Bail out only when none of the three inputs could be located.
    if not any([data_f, format_f, updated_f]):
        print("\nError: Could not find a sufficient set of input files (_data, _format, _updated). Exiting.")
        return

    output_excel_file = os.path.join(working_dir, "S7_DB_Comparison_MultiSheet.xlsx")
    generate_excel_comparison(data_f, format_f, updated_f, output_excel_file)
|
|
|
|
# Run the comparator only when executed as a script (not on import).
if __name__ == "__main__":
    main_comparator()