feat: Enhance event logging, improve CSV handling, and optimize Chart.js zoom functionality

Miguel 2025-08-15 00:42:14 +02:00
parent e517f40a5d
commit b864e81aa3
6 changed files with 693 additions and 172 deletions

View File

@@ -3928,8 +3928,311 @@
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:28:49.611975",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 500,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:29:11.885234",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 500,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:29:47.127915",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 500,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:33:00.587011",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 500,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:34:00.990295",
"level": "info",
"event_type": "application_started",
"message": "Application initialization completed successfully",
"details": {}
},
{
"timestamp": "2025-08-15T00:34:01.069801",
"level": "info",
"event_type": "dataset_activated",
"message": "Dataset activated: DAR",
"details": {
"dataset_id": "DAR",
"variables_count": 2,
"streaming_count": 2,
"prefix": "gateway_phoenix"
}
},
{
"timestamp": "2025-08-15T00:34:01.075470",
"level": "info",
"event_type": "dataset_activated",
"message": "Dataset activated: Fast",
"details": {
"dataset_id": "Fast",
"variables_count": 2,
"streaming_count": 1,
"prefix": "fast"
}
},
{
"timestamp": "2025-08-15T00:34:01.083475",
"level": "info",
"event_type": "csv_recording_started",
"message": "CSV recording started: 2 datasets activated",
"details": {
"activated_datasets": 2,
"total_datasets": 3
}
},
{
"timestamp": "2025-08-15T00:35:06.741557",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 500,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:35:28.675944",
"level": "info",
"event_type": "application_started",
"message": "Application initialization completed successfully",
"details": {}
},
{
"timestamp": "2025-08-15T00:35:28.758707",
"level": "info",
"event_type": "dataset_activated",
"message": "Dataset activated: DAR",
"details": {
"dataset_id": "DAR",
"variables_count": 2,
"streaming_count": 2,
"prefix": "gateway_phoenix"
}
},
{
"timestamp": "2025-08-15T00:35:28.764039",
"level": "info",
"event_type": "dataset_activated",
"message": "Dataset activated: Fast",
"details": {
"dataset_id": "Fast",
"variables_count": 2,
"streaming_count": 1,
"prefix": "fast"
}
},
{
"timestamp": "2025-08-15T00:35:28.768130",
"level": "info",
"event_type": "csv_recording_started",
"message": "CSV recording started: 2 datasets activated",
"details": {
"activated_datasets": 2,
"total_datasets": 3
}
},
{
"timestamp": "2025-08-15T00:35:39.619619",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 500,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:40:08.431291",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 500,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:40:14.468124",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 500,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:40:41.861858",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 500,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:40:45.084903",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 500,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:40:49.549459",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 500,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:41:19.890329",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 100,
"trigger_variable": null,
"auto_started": true
}
},
{
"timestamp": "2025-08-15T00:41:32.191310",
"level": "info",
"event_type": "plot_session_created",
"message": "Plot session 'UR29' created and started",
"details": {
"session_id": "plot_1",
"variables": [
"UR29_Brix",
"UR29_ma",
"AUX Blink_1.0S"
],
"time_window": 3600,
"trigger_variable": null,
"auto_started": true
}
}
],
"last_updated": "2025-08-15T00:27:40.211134",
"total_entries": 360
"last_updated": "2025-08-15T00:41:32.191310",
"total_entries": 381
}
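All of the new entries above share one schema: timestamp, level, event_type, message, and a details object, with last_updated and total_entries maintained at the top level. A minimal sketch of an appender that produces entries in this shape (the file path, the top-level "events" key, and the function name are assumptions for illustration, not the application's actual logger):

import json
from datetime import datetime
from pathlib import Path

LOG_FILE = Path("application_events.json")  # assumed path, not taken from this diff

def append_event(event_type, message, details=None, level="info"):
    """Append one event using the schema seen in the log above."""
    if LOG_FILE.exists():
        log = json.loads(LOG_FILE.read_text(encoding="utf-8"))
    else:
        log = {"events": []}  # top-level key name is assumed
    entry = {
        "timestamp": datetime.now().isoformat(),
        "level": level,
        "event_type": event_type,
        "message": message,
        "details": details or {},
    }
    log.setdefault("events", []).append(entry)
    log["last_updated"] = entry["timestamp"]
    log["total_entries"] = len(log["events"])
    LOG_FILE.write_text(json.dumps(log, indent=2), encoding="utf-8")

# Example mirroring one of the entries above:
append_event(
    "plot_session_created",
    "Plot session 'UR29' created and started",
    {
        "session_id": "plot_1",
        "variables": ["UR29_Brix", "UR29_ma", "AUX Blink_1.0S"],
        "time_window": 3600,
        "trigger_variable": None,
        "auto_started": True,
    },
)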

View File

@@ -2,12 +2,12 @@
"plots": [
{
"id": "plot_1",
"line_tension": 0.1,
"line_tension": 0,
"name": "UR29",
"point_hover_radius": 4,
"point_radius": 0,
"stepped": false,
"time_window": 500,
"stepped": true,
"time_window": 3600,
"trigger_enabled": false,
"trigger_on_true": true,
"trigger_variable": null,

View File

@@ -67,6 +67,9 @@ const ChartjsPlot = ({ session, height = '400px' }) => {
const [isRefreshing, setIsRefreshing] = useState(false);
const [isLoadingHistorical, setIsLoadingHistorical] = useState(false);
const resolvedConfigRef = useRef(null);
// Data preservation system for zoom operations
const dataBackupRef = useRef(new Map());
const bgColor = useColorModeValue('white', 'gray.800');
const textColor = useColorModeValue('gray.600', 'gray.300');
@@ -580,17 +583,55 @@ const ChartjsPlot = ({ session, height = '400px' }) => {
},
...(zoomAvailable ? {
zoom: {
// Only enable zoom/pan in fullscreen mode
// Always enable zoom/pan, but with settings optimized for streaming
pan: {
enabled: !!session?.isFullscreen,
enabled: true,
mode: 'x',
modifierKey: 'shift'
modifierKey: 'shift',
onPanComplete: function(context) {
// Preserve streaming state after pan
const chart = context.chart;
if (chart.options?.scales?.x?.realtime) {
const realtime = chart.options.scales.x.realtime;
// Ensure streaming continues after pan
if (!sessionDataRef.current.userPaused && !sessionDataRef.current.isPaused) {
realtime.pause = false;
}
}
}
},
zoom: {
drag: { enabled: !!session?.isFullscreen },
wheel: { enabled: !!session?.isFullscreen },
pinch: { enabled: !!session?.isFullscreen },
mode: 'x'
drag: {
enabled: true,
backgroundColor: 'rgba(128,128,128,0.3)'
},
wheel: {
enabled: true,
speed: 0.1
},
pinch: { enabled: true },
mode: 'x',
onZoomComplete: function(context) {
// Preserve streaming state and data after zoom
const chart = context.chart;
console.log('🔍 Zoom completed, preserving streaming state...');
if (chart.options?.scales?.x?.realtime) {
const realtime = chart.options.scales.x.realtime;
// Don't auto-pause streaming after zoom
if (!sessionDataRef.current.userPaused && !sessionDataRef.current.isPaused) {
setTimeout(() => {
realtime.pause = false;
}, 10);
}
}
}
},
limits: {
x: {
min: 'original',
max: 'original'
}
}
}
} : {})
@@ -830,6 +871,18 @@ const ChartjsPlot = ({ session, height = '400px' }) => {
// Update chart
if (pointsAdded > 0) {
chart.update('quiet');
// Backup data periodically when significant data is added
if (pointsAdded > 5 && dataBackupRef.current) {
const backup = chart.data.datasets.map(dataset => ({
label: dataset.label,
data: [...(dataset.data || [])],
timestamp: Date.now()
}));
dataBackupRef.current.set('current', backup);
console.log(`📦 Data backed up: ${backup.reduce((total, ds) => total + ds.data.length, 0)} points`);
}
}
// Clear NaN insertion flag
@@ -982,17 +1035,78 @@ const ChartjsPlot = ({ session, height = '400px' }) => {
if (!chartRef.current) return;
try {
console.log('🔄 Resetting zoom...');
// Backup data before zoom reset
const backup = chartRef.current.data.datasets.map(dataset => ({
label: dataset.label,
data: [...(dataset.data || [])],
timestamp: Date.now()
}));
dataBackupRef.current.set('current', backup);
console.log(`📦 Data backed up: ${backup.reduce((total, ds) => total + ds.data.length, 0)} points`);
// Store current streaming state
const realtimeOptions = chartRef.current.options?.scales?.x?.realtime;
const wasPaused = realtimeOptions?.pause || false;
// Try to reset zoom using the zoom plugin
if (chartRef.current.resetZoom) {
chartRef.current.resetZoom();
chartRef.current.resetZoom('none'); // Use 'none' animation mode for faster reset
} else if (window.Chart?.helpers?.getRelativePosition) {
// Fallback: manually reset zoom by updating scale options
const chart = chartRef.current;
if (chart.options?.scales?.x?.realtime) {
// For realtime charts, just trigger an update
// For realtime charts, reset the scale manually
const now = Date.now();
chart.options.scales.x.realtime.duration = chart.options.scales.x.realtime.duration || 60000;
chart.options.scales.x.min = now - chart.options.scales.x.realtime.duration;
chart.options.scales.x.max = now;
chart.update('none');
}
}
// Restore data if it was lost during zoom reset
setTimeout(() => {
if (chartRef.current && dataBackupRef.current.has('current')) {
const backupData = dataBackupRef.current.get('current');
const now = Date.now();
// Only restore if backup is recent (within 30 seconds)
if (now - backupData[0]?.timestamp <= 30000) {
let restored = false;
chartRef.current.data.datasets.forEach((dataset, index) => {
const backupDataset = backupData[index];
if (backupDataset && dataset.label === backupDataset.label) {
// Only restore if current data is significantly smaller
if (!dataset.data || dataset.data.length < backupDataset.data.length * 0.3) {
dataset.data = [...backupDataset.data];
restored = true;
}
}
});
if (restored) {
console.log('🔄 Chart data restored from backup');
chartRef.current.update('none');
const totalPoints = chartRef.current.data.datasets.reduce((total, dataset) =>
total + (dataset.data?.length || 0), 0
);
setDataPointsCount(totalPoints);
}
}
// Restore streaming state
if (realtimeOptions && !wasPaused) {
realtimeOptions.pause = false;
}
console.log('✅ Zoom reset with data preservation complete');
}
}, 100); // Small delay to ensure zoom reset is complete
} catch (error) {
console.warn('Failed to reset zoom:', error);
}
@@ -1339,6 +1453,35 @@ const ChartjsPlot = ({ session, height = '400px' }) => {
</Box>
)}
{/* Reset Zoom Button */}
{chartRef.current && (
<Box
position="absolute"
bottom={2}
right={2}
zIndex={10}
>
<button
onClick={resetZoom}
style={{
background: 'rgba(0, 0, 0, 0.7)',
color: 'white',
border: 'none',
borderRadius: '4px',
padding: '4px 8px',
fontSize: '11px',
cursor: 'pointer',
display: 'flex',
alignItems: 'center',
gap: '4px'
}}
title="Reset zoom and restore data"
>
🔄 Reset Zoom
</button>
</Box>
)}
<canvas
ref={canvasRef}
style={{

main.py (97 changes)

View File

@@ -1925,27 +1925,37 @@ def get_historical_data():
# Read first line to check if any required variables are present with proper encoding handling
header_line = None
for encoding in ['utf-8', 'utf-8-sig', 'utf-16', 'latin-1']:
for encoding in ["utf-8", "utf-8-sig", "utf-16", "latin-1"]:
try:
with open(csv_file, "r", encoding=encoding) as f:
header_line = f.readline().strip()
if header_line:
print(f"🔍 DEBUG: Successfully read header with {encoding} encoding")
print(
f"🔍 DEBUG: Successfully read header with {encoding} encoding"
)
break
except (UnicodeDecodeError, UnicodeError):
continue
except Exception:
continue
if not header_line:
print(f"🔍 DEBUG: Could not read header from {csv_file}, skipping")
print(
f"🔍 DEBUG: Could not read header from {csv_file}, skipping"
)
continue
# Clean header line from BOM and normalize
header_line = header_line.replace('\ufeff', '').replace('\x00', '')
headers = [h.strip().replace('\x00', '') for h in header_line.split(",")]
header_line = header_line.replace("\ufeff", "").replace("\x00", "")
headers = [
h.strip().replace("\x00", "") for h in header_line.split(",")
]
# Clean any remaining unicode artifacts
headers = [h for h in headers if h and len(h.replace('\x00', '').strip()) > 0]
headers = [
h
for h in headers
if h and len(h.replace("\x00", "").strip()) > 0
]
print(f"🔍 DEBUG: Headers in {csv_file}: {headers}")
# Check if any of our variables are in this file
@@ -1961,65 +1971,90 @@ def get_historical_data():
# Read the CSV file with proper encoding and error handling
print(f"🔍 DEBUG: Reading CSV file with pandas...")
df = None
for encoding in ['utf-8', 'utf-8-sig', 'utf-16', 'latin-1']:
for encoding in ["utf-8", "utf-8-sig", "utf-16", "latin-1"]:
try:
df = pd.read_csv(csv_file, encoding=encoding, on_bad_lines='skip')
print(f"🔍 DEBUG: CSV loaded with {encoding}, shape: {df.shape}")
df = pd.read_csv(
csv_file, encoding=encoding, on_bad_lines="skip"
)
print(
f"🔍 DEBUG: CSV loaded with {encoding}, shape: {df.shape}"
)
break
except (UnicodeDecodeError, UnicodeError, pd.errors.ParserError):
except (
UnicodeDecodeError,
UnicodeError,
pd.errors.ParserError,
):
continue
except Exception as e:
print(f"🔍 DEBUG: Error reading with {encoding}: {e}")
continue
if df is None:
print(f"Warning: Could not read CSV file {csv_file} with any encoding")
print(
f"Warning: Could not read CSV file {csv_file} with any encoding"
)
continue
# Clean column names from BOM and unicode artifacts
df.columns = [col.replace('\ufeff', '').replace('\x00', '').strip() for col in df.columns]
df.columns = [
col.replace("\ufeff", "").replace("\x00", "").strip()
for col in df.columns
]
# Convert timestamp to datetime with flexible parsing
print(f"🔍 DEBUG: Converting timestamps...")
timestamp_col = None
for col in df.columns:
if 'timestamp' in col.lower():
if "timestamp" in col.lower():
timestamp_col = col
break
if timestamp_col is None:
print(f"🔍 DEBUG: No timestamp column found in {csv_file}, skipping")
print(
f"🔍 DEBUG: No timestamp column found in {csv_file}, skipping"
)
continue
try:
# Try multiple timestamp formats
df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce')
df[timestamp_col] = pd.to_datetime(
df[timestamp_col], errors="coerce"
)
# Remove rows with invalid timestamps
df = df.dropna(subset=[timestamp_col])
if df.empty:
print(f"🔍 DEBUG: No valid timestamps in {csv_file}, skipping")
print(
f"🔍 DEBUG: No valid timestamps in {csv_file}, skipping"
)
continue
# Normalize column name to 'timestamp'
if timestamp_col != 'timestamp':
df = df.rename(columns={timestamp_col: 'timestamp'})
if timestamp_col != "timestamp":
df = df.rename(columns={timestamp_col: "timestamp"})
print(
f"🔍 DEBUG: Timestamp range: {df['timestamp'].min()} to {df['timestamp'].max()}"
)
print(f"🔍 DEBUG: Filter range: {start_time} to {end_time}")
except Exception as e:
print(f"🔍 DEBUG: Timestamp conversion failed for {csv_file}: {e}")
print(
f"🔍 DEBUG: Timestamp conversion failed for {csv_file}: {e}"
)
continue
# Recalculate matching variables after column cleaning
clean_headers = list(df.columns)
matching_vars = [var for var in variables if var in clean_headers]
print(f"🔍 DEBUG: Matching variables after cleaning: {matching_vars}")
print(
f"🔍 DEBUG: Matching variables after cleaning: {matching_vars}"
)
if not matching_vars:
print(f"🔍 DEBUG: No matching variables after cleaning in {csv_file}, skipping")
print(
f"🔍 DEBUG: No matching variables after cleaning in {csv_file}, skipping"
)
continue
# Filter by time range
@@ -2069,7 +2104,9 @@ def get_historical_data():
)
except Exception as e:
# Skip invalid values
print(f"🔍 DEBUG: Skipping invalid value for {var}: {e}")
print(
f"🔍 DEBUG: Skipping invalid value for {var}: {e}"
)
continue
except Exception as e:
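Condensed into a standalone helper, the pattern these hunks apply is: try several encodings, skip malformed rows, strip BOM/NUL artifacts from column names, coerce the timestamp column, and filter to the requested window. A minimal sketch under those assumptions; the function name is illustrative and not part of the endpoint above:

import pandas as pd

ENCODINGS = ["utf-8", "utf-8-sig", "utf-16", "latin-1"]

def load_csv_window(csv_file, variables, start_time, end_time):
    """Read one CSV tolerantly and return rows inside [start_time, end_time]."""
    df = None
    for encoding in ENCODINGS:
        try:
            df = pd.read_csv(csv_file, encoding=encoding, on_bad_lines="skip")
            break
        except (UnicodeDecodeError, UnicodeError, pd.errors.ParserError):
            continue
    if df is None:
        return None  # unreadable with any known encoding

    # Strip BOM / NUL artifacts left behind by mixed-encoding writers
    df.columns = [c.replace("\ufeff", "").replace("\x00", "").strip() for c in df.columns]

    # Locate and normalize the timestamp column
    ts_col = next((c for c in df.columns if "timestamp" in c.lower()), None)
    if ts_col is None:
        return None
    df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce")
    df = df.dropna(subset=[ts_col]).rename(columns={ts_col: "timestamp"})

    # Keep only requested variables that survived header cleaning
    wanted = [v for v in variables if v in df.columns]
    if not wanted:
        return None

    mask = (df["timestamp"] >= start_time) & (df["timestamp"] <= end_time)
    return df.loc[mask, ["timestamp"] + wanted]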

View File

@@ -3,11 +3,11 @@
"should_connect": true,
"should_stream": false,
"active_datasets": [
"Test",
"DAR",
"Fast",
"DAR"
"Test"
]
},
"auto_recovery_enabled": true,
"last_update": "2025-08-15T00:26:15.383017"
"last_update": "2025-08-15T00:35:28.773668"
}

View File

@@ -25,228 +25,254 @@ from datetime import datetime
class CSVValidator:
"""Validates and cleans CSV files for the PLC streaming system"""
def __init__(self):
self.encodings_to_try = ['utf-8', 'utf-8-sig', 'utf-16', 'latin-1', 'cp1252']
self.encodings_to_try = ["utf-8", "utf-8-sig", "utf-16", "latin-1", "cp1252"]
self.issues_found = []
self.fixed_files = []
def detect_encoding(self, file_path):
"""Detect the encoding of a CSV file"""
for encoding in self.encodings_to_try:
try:
with open(file_path, 'r', encoding=encoding) as f:
with open(file_path, "r", encoding=encoding) as f:
f.read(1024) # Read first 1KB
return encoding
except (UnicodeDecodeError, UnicodeError):
continue
return None
def read_csv_headers(self, file_path):
"""Read CSV headers with encoding detection"""
encoding = self.detect_encoding(file_path)
if not encoding:
return None, None, "Could not detect encoding"
try:
with open(file_path, 'r', encoding=encoding) as f:
with open(file_path, "r", encoding=encoding) as f:
header_line = f.readline().strip()
if not header_line:
return None, encoding, "Empty header line"
# Clean header from BOM and unicode artifacts
header_line = header_line.replace('\ufeff', '').replace('\x00', '')
headers = [h.strip().replace('\x00', '') for h in header_line.split(',')]
header_line = header_line.replace("\ufeff", "").replace("\x00", "")
headers = [
h.strip().replace("\x00", "") for h in header_line.split(",")
]
headers = [h for h in headers if h and len(h.strip()) > 0]
return headers, encoding, None
except Exception as e:
return None, encoding, str(e)
def validate_csv_structure(self, file_path):
"""Validate CSV structure and detect issues"""
issues = []
# Check headers
headers, encoding, header_error = self.read_csv_headers(file_path)
if header_error:
issues.append({
'type': 'header_error',
'message': header_error,
'file': file_path
})
issues.append(
{"type": "header_error", "message": header_error, "file": file_path}
)
return issues
if not headers:
issues.append({
'type': 'no_headers',
'message': 'No valid headers found',
'file': file_path
})
issues.append(
{
"type": "no_headers",
"message": "No valid headers found",
"file": file_path,
}
)
return issues
# Check for encoding issues in headers
if any('\x00' in h or 'ÿþ' in h or '' in h for h in headers):
issues.append({
'type': 'encoding_artifacts',
'message': f'Headers contain encoding artifacts: {headers}',
'file': file_path,
'encoding': encoding
})
if any("\x00" in h or "ÿþ" in h or "" in h for h in headers):
issues.append(
{
"type": "encoding_artifacts",
"message": f"Headers contain encoding artifacts: {headers}",
"file": file_path,
"encoding": encoding,
}
)
# Try to read full CSV with pandas
try:
df = None
for enc in self.encodings_to_try:
try:
df = pd.read_csv(file_path, encoding=enc, on_bad_lines='skip')
df = pd.read_csv(file_path, encoding=enc, on_bad_lines="skip")
break
except (UnicodeDecodeError, UnicodeError, pd.errors.ParserError):
continue
if df is None:
issues.append({
'type': 'read_error',
'message': 'Could not read CSV with any encoding',
'file': file_path
})
issues.append(
{
"type": "read_error",
"message": "Could not read CSV with any encoding",
"file": file_path,
}
)
return issues
# Check for inconsistent columns
expected_cols = len(headers)
if len(df.columns) != expected_cols:
issues.append({
'type': 'column_mismatch',
'message': f'Expected {expected_cols} columns, found {len(df.columns)}',
'file': file_path,
'expected_headers': headers,
'actual_headers': list(df.columns)
})
issues.append(
{
"type": "column_mismatch",
"message": f"Expected {expected_cols} columns, found {len(df.columns)}",
"file": file_path,
"expected_headers": headers,
"actual_headers": list(df.columns),
}
)
# Check for timestamp column
timestamp_cols = [col for col in df.columns if 'timestamp' in col.lower()]
timestamp_cols = [col for col in df.columns if "timestamp" in col.lower()]
if not timestamp_cols:
issues.append({
'type': 'no_timestamp',
'message': 'No timestamp column found',
'file': file_path,
'columns': list(df.columns)
})
issues.append(
{
"type": "no_timestamp",
"message": "No timestamp column found",
"file": file_path,
"columns": list(df.columns),
}
)
else:
# Validate timestamp format
timestamp_col = timestamp_cols[0]
try:
df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce')
df[timestamp_col] = pd.to_datetime(
df[timestamp_col], errors="coerce"
)
invalid_timestamps = df[timestamp_col].isna().sum()
if invalid_timestamps > 0:
issues.append({
'type': 'invalid_timestamps',
'message': f'{invalid_timestamps} invalid timestamps found',
'file': file_path,
'timestamp_column': timestamp_col
})
issues.append(
{
"type": "invalid_timestamps",
"message": f"{invalid_timestamps} invalid timestamps found",
"file": file_path,
"timestamp_column": timestamp_col,
}
)
except Exception as e:
issues.append({
'type': 'timestamp_parse_error',
'message': f'Cannot parse timestamps: {str(e)}',
'file': file_path,
'timestamp_column': timestamp_col
})
issues.append(
{
"type": "timestamp_parse_error",
"message": f"Cannot parse timestamps: {str(e)}",
"file": file_path,
"timestamp_column": timestamp_col,
}
)
except Exception as e:
issues.append({
'type': 'general_error',
'message': f'Error reading CSV: {str(e)}',
'file': file_path
})
issues.append(
{
"type": "general_error",
"message": f"Error reading CSV: {str(e)}",
"file": file_path,
}
)
return issues
def scan_directory(self, directory_path):
"""Scan directory for CSV issues"""
print(f"🔍 Scanning directory: {directory_path}")
csv_files = glob.glob(os.path.join(directory_path, "**/*.csv"), recursive=True)
total_files = len(csv_files)
print(f"📁 Found {total_files} CSV files")
all_issues = []
for i, csv_file in enumerate(csv_files, 1):
print(f"📄 Checking {i}/{total_files}: {os.path.basename(csv_file)}")
issues = self.validate_csv_structure(csv_file)
if issues:
all_issues.extend(issues)
print(f" ⚠️ {len(issues)} issues found")
else:
print(f" ✅ OK")
return all_issues
def fix_csv_file(self, file_path, backup=True):
"""Fix a problematic CSV file"""
print(f"🔧 Fixing CSV file: {file_path}")
if backup:
backup_path = f"{file_path}.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}"
backup_path = (
f"{file_path}.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}"
)
import shutil
shutil.copy2(file_path, backup_path)
print(f"📋 Backup created: {backup_path}")
# Detect encoding and read file
encoding = self.detect_encoding(file_path)
if not encoding:
print("❌ Could not detect encoding")
return False
try:
# Read with detected encoding
df = pd.read_csv(file_path, encoding=encoding, on_bad_lines='skip')
df = pd.read_csv(file_path, encoding=encoding, on_bad_lines="skip")
# Clean column names
df.columns = [col.replace('\ufeff', '').replace('\x00', '').strip() for col in df.columns]
df.columns = [
col.replace("\ufeff", "").replace("\x00", "").strip()
for col in df.columns
]
# Find timestamp column
timestamp_cols = [col for col in df.columns if 'timestamp' in col.lower()]
timestamp_cols = [col for col in df.columns if "timestamp" in col.lower()]
if timestamp_cols:
timestamp_col = timestamp_cols[0]
# Normalize to 'timestamp'
if timestamp_col != 'timestamp':
df = df.rename(columns={timestamp_col: 'timestamp'})
if timestamp_col != "timestamp":
df = df.rename(columns={timestamp_col: "timestamp"})
# Fix timestamps
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
# Remove rows with invalid timestamps
df = df.dropna(subset=['timestamp'])
df = df.dropna(subset=["timestamp"])
# Write fixed file with UTF-8 encoding
df.to_csv(file_path, index=False, encoding='utf-8')
df.to_csv(file_path, index=False, encoding="utf-8")
print(f"✅ Fixed and saved with UTF-8 encoding")
return True
except Exception as e:
print(f"❌ Error fixing file: {e}")
return False
def print_issue_summary(self, issues):
"""Print a summary of issues found"""
if not issues:
print("\n🎉 No issues found!")
return
print(f"\n📊 Summary: {len(issues)} issues found")
issue_types = {}
for issue in issues:
issue_type = issue['type']
issue_type = issue["type"]
if issue_type not in issue_types:
issue_types[issue_type] = []
issue_types[issue_type].append(issue)
for issue_type, type_issues in issue_types.items():
print(f"\n🔸 {issue_type.replace('_', ' ').title()}: {len(type_issues)} files")
print(
f"\n🔸 {issue_type.replace('_', ' ').title()}: {len(type_issues)} files"
)
for issue in type_issues[:5]: # Show first 5
print(f" - {os.path.basename(issue['file'])}: {issue['message']}")
if len(type_issues) > 5:
@@ -254,41 +280,53 @@ class CSVValidator:
def main():
parser = argparse.ArgumentParser(description='CSV Validator and Cleaner for PLC Streaming System')
parser.add_argument('path', help='Path to CSV file or directory to process')
parser.add_argument('--scan', action='store_true', help='Scan directory for issues (default)')
parser.add_argument('--fix', action='store_true', help='Fix individual CSV file')
parser.add_argument('--fix-all', action='store_true', help='Fix all problematic files in directory')
parser.add_argument('--no-backup', action='store_true', help='Do not create backup when fixing')
parser = argparse.ArgumentParser(
description="CSV Validator and Cleaner for PLC Streaming System"
)
parser.add_argument("path", help="Path to CSV file or directory to process")
parser.add_argument(
"--scan", action="store_true", help="Scan directory for issues (default)"
)
parser.add_argument("--fix", action="store_true", help="Fix individual CSV file")
parser.add_argument(
"--fix-all", action="store_true", help="Fix all problematic files in directory"
)
parser.add_argument(
"--no-backup", action="store_true", help="Do not create backup when fixing"
)
args = parser.parse_args()
validator = CSVValidator()
if args.fix and os.path.isfile(args.path):
# Fix single file
success = validator.fix_csv_file(args.path, backup=not args.no_backup)
sys.exit(0 if success else 1)
elif os.path.isdir(args.path):
# Scan directory
issues = validator.scan_directory(args.path)
validator.print_issue_summary(issues)
if args.fix_all and issues:
print(f"\n🔧 Fixing {len(set(issue['file'] for issue in issues))} problematic files...")
problematic_files = set(issue['file'] for issue in issues if issue['type'] != 'no_timestamp')
print(
f"\n🔧 Fixing {len(set(issue['file'] for issue in issues))} problematic files..."
)
problematic_files = set(
issue["file"] for issue in issues if issue["type"] != "no_timestamp"
)
fixed_count = 0
for file_path in problematic_files:
if validator.fix_csv_file(file_path, backup=not args.no_backup):
fixed_count += 1
print(f"✅ Fixed {fixed_count}/{len(problematic_files)} files")
sys.exit(1 if issues else 0)
else:
print(f"❌ Path not found: {args.path}")
sys.exit(1)
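Beyond the CLI flags above, the class can also be driven directly; a short usage sketch based only on the methods shown in this diff (the module name csv_validator and the "records" directory are assumptions):

from csv_validator import CSVValidator  # module name assumed from context

validator = CSVValidator()

# Scan a directory tree of CSV files and summarize what was found
issues = validator.scan_directory("records")
validator.print_issue_summary(issues)

# Repair the problematic files, keeping timestamped .backup copies
problematic = sorted({i["file"] for i in issues if i["type"] != "no_timestamp"})
for path in problematic:
    validator.fix_csv_file(path, backup=True)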