Jackify/jackify/backend/handlers/engine_monitor.py
Commit cd591c14e3 by Omni, 2025-09-05: Initial public release v0.1.0 - Linux Wabbajack Modlist Application
Jackify provides native Linux support for Wabbajack modlist installation
   and management with automated Steam integration and Proton configuration.

   Key Features:
   - Almost-native Linux implementation (texconv.exe is run via Proton)
   - Automated Steam shortcut creation and Proton prefix management
   - Both CLI and GUI interfaces, with Steam Deck optimization

   Supported Games:
   - Skyrim Special Edition
   - Fallout 4
   - Fallout New Vegas
   - Oblivion, Starfield, Enderal, and various other games

   Technical Architecture:
   - Clean separation between frontend and backend services
   - Powered by jackify-engine 0.3.x for Wabbajack-matching modlist installation

"""
Engine Performance Monitor
Monitors the jackify-engine process for performance issues like CPU stalls,
memory problems, and excessive I/O wait times.
"""
import time
import threading
import psutil
import logging
import os
from typing import Optional, Dict, Any, Callable
from dataclasses import dataclass
from enum import Enum
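
# Typical usage (illustrative sketch; `engine_proc` below is a hypothetical
# handle to an already-launched jackify-engine process):
#
#     monitor = EnginePerformanceMonitor()
#     monitor.add_callback(create_debug_callback(logging.getLogger(__name__)))
#     if monitor.start_monitoring(engine_proc.pid):
#         ...  # run the installation while metrics are sampled in the background
#         monitor.stop_monitoring()
#         print(monitor.get_metrics_summary())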


class PerformanceState(Enum):
    NORMAL = "normal"
    STALLED = "stalled"
    HIGH_MEMORY = "high_memory"
    HIGH_IO_WAIT = "high_io_wait"
    ZOMBIE = "zombie"


@dataclass
class PerformanceMetrics:
    timestamp: float
    cpu_percent: float
    memory_percent: float
    memory_mb: float
    io_read_mb: float
    io_write_mb: float
    thread_count: int
    fd_count: int
    state: PerformanceState
    # Additional diagnostics for engine vs wrapper distinction
    parent_cpu_percent: Optional[float] = None
    parent_memory_mb: Optional[float] = None
    engine_responsive: bool = True
    # ImageMagick resource usage
    magick_cpu_percent: float = 0.0
    magick_memory_mb: float = 0.0


class EnginePerformanceMonitor:
    """
    Monitors jackify-engine process performance and detects common stall patterns.

    This is designed to help diagnose the issue where extraction starts at 80-100% CPU
    but drops to 2% after ~5 minutes and requires manual kills.

    Also monitors the parent Python process to distinguish between engine and wrapper issues.
    """

    def __init__(self,
                 logger: Optional[logging.Logger] = None,
                 stall_threshold: float = 5.0,    # CPU below this % for stall_duration = stall
                 stall_duration: float = 120.0,   # seconds of low CPU = stall
                 memory_threshold: float = 85.0,  # % memory usage threshold
                 sample_interval: float = 5.0):   # seconds between samples
        self.logger = logger or logging.getLogger(__name__)
        self.stall_threshold = stall_threshold
        self.stall_duration = stall_duration
        self.memory_threshold = memory_threshold
        self.sample_interval = sample_interval

        self._process: Optional[psutil.Process] = None
        self._parent_process: Optional[psutil.Process] = None
        self._monitoring = False
        self._monitor_thread: Optional[threading.Thread] = None
        self._metrics_history: list[PerformanceMetrics] = []
        self._callbacks: list[Callable[[PerformanceMetrics], None]] = []

        # Performance state tracking
        self._low_cpu_start_time: Optional[float] = None
        self._last_io_read = 0
        self._last_io_write = 0

    def add_callback(self, callback: Callable[[PerformanceMetrics], None]):
        """Add a callback to receive performance metrics updates."""
        self._callbacks.append(callback)

    def start_monitoring(self, pid: int) -> bool:
        """Start monitoring the given process ID."""
        try:
            self._process = psutil.Process(pid)
            # Also monitor the parent Python process for comparison
            try:
                self._parent_process = psutil.Process(os.getpid())
            except Exception:
                self._parent_process = None
            self._monitoring = True
            self._monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
            self._monitor_thread.start()
            process_name = self._process.name() if self._process else "unknown"
            self.logger.info(f"Started performance monitoring for PID {pid} ({process_name}) "
                             f"(stall threshold: {self.stall_threshold}% CPU for {self.stall_duration}s)")
            return True
        except psutil.NoSuchProcess:
            self.logger.error(f"Process {pid} not found")
            return False
        except Exception as e:
            self.logger.error(f"Failed to start monitoring PID {pid}: {e}")
            return False

    def stop_monitoring(self):
        """Stop monitoring the process."""
        self._monitoring = False
        if self._monitor_thread and self._monitor_thread.is_alive():
            self._monitor_thread.join(timeout=10)

    def get_metrics_summary(self) -> Dict[str, Any]:
        """Get a summary of collected metrics."""
        if not self._metrics_history:
            return {}

        cpu_values = [m.cpu_percent for m in self._metrics_history]
        memory_values = [m.memory_mb for m in self._metrics_history]
        stalled_count = sum(1 for m in self._metrics_history if m.state == PerformanceState.STALLED)

        # Engine vs wrapper analysis
        engine_avg_cpu = sum(cpu_values) / len(cpu_values)
        parent_cpu_values = [m.parent_cpu_percent for m in self._metrics_history if m.parent_cpu_percent is not None]
        parent_avg_cpu = sum(parent_cpu_values) / len(parent_cpu_values) if parent_cpu_values else 0

        return {
            "total_samples": len(self._metrics_history),
            "monitoring_duration": self._metrics_history[-1].timestamp - self._metrics_history[0].timestamp,
            # Engine process metrics
            "engine_avg_cpu_percent": engine_avg_cpu,
            "engine_max_cpu_percent": max(cpu_values),
            "engine_min_cpu_percent": min(cpu_values),
            "engine_avg_memory_mb": sum(memory_values) / len(memory_values),
            "engine_max_memory_mb": max(memory_values),
            # Parent process metrics (for comparison)
            "parent_avg_cpu_percent": parent_avg_cpu,
            # Stall analysis
            "stalled_samples": stalled_count,
            "stall_percentage": (stalled_count / len(self._metrics_history)) * 100,
            # Diagnosis hints
            "likely_engine_issue": engine_avg_cpu < 10 and parent_avg_cpu < 5,
            "likely_wrapper_issue": engine_avg_cpu > 20 and parent_avg_cpu > 50,
        }
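
    # How the diagnosis hints above read in practice (illustrative numbers): an
    # install where jackify-engine averages ~3% CPU while the Python wrapper also
    # sits near idle yields likely_engine_issue=True, pointing at the .NET binary
    # rather than this wrapper; a busy wrapper (over 50% CPU) together with an
    # engine above 20% CPU flags likely_wrapper_issue instead.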

    def _monitor_loop(self):
        """Main monitoring loop."""
        while self._monitoring:
            try:
                if not self._process or not self._process.is_running():
                    self.logger.warning("Monitored engine process is no longer running")
                    break

                metrics = self._collect_metrics()
                self._metrics_history.append(metrics)

                # Notify callbacks
                for callback in self._callbacks:
                    try:
                        callback(metrics)
                    except Exception as e:
                        self.logger.error(f"Error in performance callback: {e}")

                # Log significant events with engine vs wrapper context
                if metrics.state == PerformanceState.STALLED:
                    parent_info = ""
                    if metrics.parent_cpu_percent is not None:
                        parent_info = f", Python wrapper: {metrics.parent_cpu_percent:.1f}% CPU"
                    self.logger.warning(f"🚨 ENGINE STALL DETECTED: jackify-engine CPU at {metrics.cpu_percent:.1f}% "
                                        f"for {self.stall_duration}s+ (Memory: {metrics.memory_mb:.1f}MB, "
                                        f"Threads: {metrics.thread_count}, FDs: {metrics.fd_count}{parent_info})")
                    # Provide diagnosis hint
                    if metrics.parent_cpu_percent and metrics.parent_cpu_percent > 10:
                        self.logger.warning("Warning: Python wrapper still active - likely jackify-engine (.NET) issue")
                    else:
                        self.logger.warning("Warning: Both processes low CPU - possible system-wide issue")
                elif metrics.state == PerformanceState.HIGH_MEMORY:
                    self.logger.warning(f"HIGH MEMORY USAGE in jackify-engine: {metrics.memory_percent:.1f}% "
                                        f"({metrics.memory_mb:.1f}MB)")

                time.sleep(self.sample_interval)
            except psutil.NoSuchProcess:
                self.logger.info("Monitored engine process terminated")
                break
            except Exception as e:
                self.logger.error(f"Error in monitoring loop: {e}")
                time.sleep(self.sample_interval)

    def _collect_metrics(self) -> PerformanceMetrics:
        """Collect current performance metrics."""
        now = time.time()

        # Get basic process info for the engine
        cpu_percent = self._process.cpu_percent()
        memory_info = self._process.memory_info()
        memory_mb = memory_info.rss / (1024 * 1024)
        memory_percent = self._process.memory_percent()

        # Get parent process info for comparison
        parent_cpu_percent = None
        parent_memory_mb = None
        if self._parent_process:
            try:
                parent_cpu_percent = self._parent_process.cpu_percent()
                parent_memory_info = self._parent_process.memory_info()
                parent_memory_mb = parent_memory_info.rss / (1024 * 1024)
            except Exception:
                pass

        # Get I/O info
        try:
            io_counters = self._process.io_counters()
            io_read_mb = io_counters.read_bytes / (1024 * 1024)
            io_write_mb = io_counters.write_bytes / (1024 * 1024)
        except (psutil.AccessDenied, AttributeError):
            io_read_mb = 0
            io_write_mb = 0

        # Get thread and file descriptor counts
        try:
            thread_count = self._process.num_threads()
        except (psutil.AccessDenied, AttributeError):
            thread_count = 0
        try:
            fd_count = self._process.num_fds()
        except (psutil.AccessDenied, AttributeError):
            fd_count = 0

        # Determine performance state
        state = self._determine_state(cpu_percent, memory_percent, now)

        # Aggregate ImageMagick ('magick') child process usage
        magick_cpu = 0.0
        magick_mem = 0.0
        try:
            for child in self._process.children(recursive=True):
                try:
                    if child.name() == 'magick' or 'magick' in ' '.join(child.cmdline()):
                        magick_cpu += child.cpu_percent()
                        magick_mem += child.memory_info().rss / (1024 * 1024)
                except Exception:
                    continue
        except Exception:
            pass

        return PerformanceMetrics(
            timestamp=now,
            cpu_percent=cpu_percent,
            memory_percent=memory_percent,
            memory_mb=memory_mb,
            io_read_mb=io_read_mb,
            io_write_mb=io_write_mb,
            thread_count=thread_count,
            fd_count=fd_count,
            state=state,
            parent_cpu_percent=parent_cpu_percent,
            parent_memory_mb=parent_memory_mb,
            # Responsive if CPU is above the stall threshold, or the low-CPU
            # period has not yet lasted longer than the stall duration
            engine_responsive=(cpu_percent > self.stall_threshold
                               or (now - self._low_cpu_start_time if self._low_cpu_start_time else 0) < self.stall_duration),
            magick_cpu_percent=magick_cpu,
            magick_memory_mb=magick_mem
        )

    def _determine_state(self, cpu_percent: float, memory_percent: float, timestamp: float) -> PerformanceState:
        """Determine the current performance state."""
        # Check for high memory usage
        if memory_percent > self.memory_threshold:
            return PerformanceState.HIGH_MEMORY

        # Check for CPU stall: with the defaults, CPU below 5% for a continuous
        # 120s window flips the state to STALLED; any sample above the threshold
        # resets the timer.
        if cpu_percent < self.stall_threshold:
            if self._low_cpu_start_time is None:
                self._low_cpu_start_time = timestamp
            elif timestamp - self._low_cpu_start_time >= self.stall_duration:
                return PerformanceState.STALLED
        else:
            # CPU is above threshold, reset stall timer
            self._low_cpu_start_time = None

        return PerformanceState.NORMAL


def create_debug_callback(logger: logging.Logger) -> Callable[[PerformanceMetrics], None]:
    """Create a callback that logs detailed performance metrics for debugging."""
    def debug_callback(metrics: PerformanceMetrics):
        parent_info = f", Python: {metrics.parent_cpu_percent:.1f}%" if metrics.parent_cpu_percent else ""
        magick_info = (f", Magick: {metrics.magick_cpu_percent:.1f}% CPU, {metrics.magick_memory_mb:.1f}MB RAM"
                       if metrics.magick_cpu_percent or metrics.magick_memory_mb else "")
        logger.debug(f"Engine Performance: jackify-engine CPU={metrics.cpu_percent:.1f}%, "
                     f"Memory={metrics.memory_mb:.1f}MB ({metrics.memory_percent:.1f}%), "
                     f"Threads={metrics.thread_count}, FDs={metrics.fd_count}, "
                     f"State={metrics.state.value}{parent_info}{magick_info}")
    return debug_callback


def create_stall_alert_callback(logger: logging.Logger,
                                alert_func: Optional[Callable[[str], None]] = None
                                ) -> Callable[[PerformanceMetrics], None]:
    """Create a callback that alerts when performance issues are detected."""
    def alert_callback(metrics: PerformanceMetrics):
        if metrics.state in [PerformanceState.STALLED, PerformanceState.HIGH_MEMORY]:
            # Provide context about engine vs wrapper
            if metrics.state == PerformanceState.STALLED:
                if metrics.parent_cpu_percent and metrics.parent_cpu_percent > 10:
                    issue_type = "jackify-engine (.NET binary) stalled"
                else:
                    issue_type = "system-wide performance issue"
            else:
                issue_type = metrics.state.value.upper()
            message = (f"{issue_type} - Engine CPU: {metrics.cpu_percent:.1f}%, "
                       f"Memory: {metrics.memory_mb:.1f}MB")
            logger.warning(message)
            if alert_func:
                alert_func(message)
    return alert_callback
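

if __name__ == "__main__":
    # Minimal manual test harness (illustrative sketch, not part of the original
    # module): attach the monitor to an arbitrary PID passed on the command line
    # and print debug samples until interrupted.
    import sys

    logging.basicConfig(level=logging.DEBUG)
    _logger = logging.getLogger("engine_monitor_demo")

    target_pid = int(sys.argv[1]) if len(sys.argv) > 1 else os.getpid()
    monitor = EnginePerformanceMonitor(logger=_logger, sample_interval=2.0)
    monitor.add_callback(create_debug_callback(_logger))
    monitor.add_callback(create_stall_alert_callback(_logger))

    if monitor.start_monitoring(target_pid):
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            pass
        finally:
            monitor.stop_monitoring()
            _logger.info("Summary: %s", monitor.get_metrics_summary())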