Quick Start
Get started building modules for the Nexus-1 platform in minutes.
1. Install the SDK
# Install via NuGet Package Manager
dotnet add package Nexus.SDK.Client
# Or via Package Manager Console
Install-Package Nexus.SDK.Client
# Or add to .csproj file
<PackageReference Include="Nexus.SDK.Client" Version="1.0.0" />
This will automatically install the minimal required dependencies (Microsoft.Extensions abstractions).
# Install via pip
pip install nexus-sdk-client
# Or install from requirements.txt
echo "nexus-sdk-client>=1.0.0" >> requirements.txt
pip install -r requirements.txt
# For development installation (editable install from a cloned source checkout)
pip install -e "nexus-sdk-client[dev]"
The Python SDK provides async/await support and type hints for better development experience.
# Using CMake
# Add to CMakeLists.txt:
find_package(NexusSDKClient REQUIRED)
target_link_libraries(your_module PRIVATE NexusSDKClient::Core)
# Using vcpkg
vcpkg install nexus-sdk-client
# Using Conan
conan install nexus-sdk-client/1.0.0@
The C++ SDK provides header-only interfaces with modern C++17 support (the coroutine-based examples below additionally require C++20).
% Option 1: Add to MATLAB path
addpath('/path/to/nexus-sdk-client-matlab')
savepath % Save for future sessions
% Option 2: Install from MATLAB File Exchange
% Search for "NEXUS-1 SDK Client" in File Exchange
% Option 3: Install from Add-On Explorer
% Home tab > Add-Ons > Get Add-Ons
% Search for "NEXUS-1 SDK Client for MATLAB"
MATLAB SDK provides object-oriented interfaces compatible with MATLAB R2019b and later.
% Option 1: VI Package Manager (VIPM)
% 1. Open VI Package Manager
% 2. Search for "NEXUS-1 SDK Client"
% 3. Click Install
% Option 2: Manual Installation
% 1. Download from ni.com/tools/nexus-sdk-client
% 2. Extract to LabVIEW/user.lib/
% 3. Restart LabVIEW
% Option 3: LabVIEW Tools Network
% Tools > VI Package Manager > Browse
LabVIEW SDK provides VIs and type definitions for LabVIEW 2019 and later.
2. Create Your Module
Module Capabilities
Nexus uses a capability-based security model where modules must declare their required capabilities in the manifest file. This ensures modules only access resources and APIs they need.
How Capabilities Work
When the Nexus host loads your module, it:
- Reads the capabilities declared in the manifest file
- Validates that required capabilities are available in the system
- Grants only the declared capabilities to the module
- Enforces capability checks at runtime for sensitive operations
- Provides the granted capabilities via the module context
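For example, a module can inspect and validate its granted capabilities during initialization. This is a minimal sketch using the IsCapabilityEnabled and EnabledCapabilities helpers from ModuleBase (described in the API Reference); the READ_SENSORS check is illustrative:
// Sketch: validating granted capabilities at startup
protected override async Task OnInitializeAsync()
{
    Logger.LogInformation("Granted capabilities: {Caps}", string.Join(", ", EnabledCapabilities));
    // Fail fast if a capability this module depends on was not granted
    if (!IsCapabilityEnabled("READ_SENSORS"))
    {
        throw new InvalidOperationException("READ_SENSORS capability is required");
    }
    await Task.CompletedTask;
}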
Common System Capabilities
- READ_SENSORS - Read sensor data
- WRITE_CONTROLS - Write to control systems
- PUBLISH_TELEMETRY - Publish telemetry data
- STORAGE_READ - Read from persistent storage
- STORAGE_WRITE - Write to persistent storage
- NETWORK_ACCESS - Make network requests
- HARDWARE_ACCESS - Access hardware devices
- SYSTEM_CONFIG - Read system configuration
using Nexus.Contracts;
using Nexus.SDK.Client;
using Microsoft.Extensions.Logging;
// Simple module that inherits from ModuleBase
// The [Module] attribute makes the class discoverable by the host
[Module("temperature-monitor", "Temperature Monitor", "1.0.0")]
public class TemperatureMonitor : ModuleBase
{
private Timer? _timer;
private double _threshold = 75.0;
// Called by host during initialization
protected override async Task OnInitializeAsync()
{
// Logger is automatically available from ModuleBase
Logger.LogInformation("Temperature monitor initializing");
// Access configuration from manifest
_threshold = Configuration.GetValue<double>("threshold", 75.0);
// Module capabilities are defined in the manifest file
// The host ensures we only have access to declared capabilities
Logger.LogInformation("Temperature threshold set to {Threshold}°C", _threshold);
await Task.CompletedTask;
}
// Called by host when module should start
public override async Task StartAsync(CancellationToken cancellationToken)
{
await base.StartAsync(cancellationToken);
// Subscribe to temperature threshold commands
await MessageBus.Subscribe<SetThresholdCommand>(HandleThresholdCommand);
// Start monitoring
_timer = new Timer(CheckTemperature, null, TimeSpan.Zero, TimeSpan.FromSeconds(5));
}
private Task HandleThresholdCommand(SetThresholdCommand command)
{
    Logger.LogInformation("Threshold set to {Threshold}", command.Threshold);
    _threshold = command.Threshold;
    return Task.CompletedTask;
}
private async void CheckTemperature(object? state)
{
// Read temperature (simulated)
var temp = ReadTemperature();
// Check against threshold
if (temp > _threshold)
{
Logger.LogWarning("Temperature {Temp}°C exceeds threshold {Threshold}°C", temp, _threshold);
}
// Publish telemetry
// The host ensures we can only publish if we have the PUBLISH_TELEMETRY capability
await MessageBus.PublishAsync("telemetry/temperature", new TemperatureReading
{
Value = temp,
Timestamp = DateTime.UtcNow
});
}
private double ReadTemperature() => 20 + Random.Shared.NextDouble() * 10; // Simulated sensor read
public override async Task StopAsync(CancellationToken cancellationToken)
{
_timer?.Dispose();
await base.StopAsync(cancellationToken);
}
}
// Message contracts using SDK base classes
public class SetThresholdCommand : RequestMessage
{
public double Threshold { get; set; }
}
public class TemperatureReading : EventMessage
{
public double Value { get; set; }
}
from nexus_sdk import Module, RequestMessage, EventMessage, module
import asyncio
import random
@module("temperature-monitor", "Temperature Monitor", "1.0.0")
class TemperatureMonitor(Module):
def __init__(self):
super().__init__()
self._timer_task = None
self._threshold = 75.0
async def on_initialize(self):
"""Called by host during initialization"""
self.logger.info("Temperature monitor initializing")
# Access configuration from manifest
self._threshold = self.config.get("threshold", 75.0)
# Module capabilities are defined in the manifest file
# The host ensures we only have access to declared capabilities
self.logger.info(f"Temperature threshold set to {self._threshold}°C")
async def start(self):
"""Called by host when module should start"""
# Subscribe to temperature threshold commands
await self.message_bus.subscribe(
SetThresholdCommand,
self.handle_threshold_command
)
# Start monitoring
self._timer_task = asyncio.create_task(self.monitor_temperature())
    async def handle_threshold_command(self, command: SetThresholdCommand):
        self.logger.info(f"Threshold set to {command.threshold}")
        self._threshold = command.threshold
async def monitor_temperature(self):
while True:
# Read temperature (simulated)
            temp = self.read_temperature()
            # Warn when the threshold is exceeded
            if temp > self._threshold:
                self.logger.warning(f"Temperature {temp}°C exceeds threshold {self._threshold}°C")
            # Publish reading
await self.message_bus.publish(
"telemetry/temperature",
TemperatureReading(value=temp)
)
await asyncio.sleep(5)
async def stop(self):
"""Called by host when module should stop"""
if self._timer_task:
            self._timer_task.cancel()

    def read_temperature(self):
        """Read temperature (simulated)"""
        return 20 + 10 * random.random()
# Message contracts using SDK base classes
class SetThresholdCommand(RequestMessage):
threshold: float
class TemperatureReading(EventMessage):
value: float
#include <nexus/sdk/module_base.hpp>  // exact SDK header names may differ
#include <fmt/ranges.h>               // for fmt::join
#include <chrono>
using namespace nexus::sdk;
// Message contracts using SDK base classes
struct SetThresholdCommand : public RequestMessage {
double threshold;
};
struct TemperatureReading : public EventMessage {
double value;
};
class TemperatureMonitor : public ModuleBase {
public:
TemperatureMonitor() : ModuleBase("temperature-monitor", "1.0.0") {}
protected:
// Called by host during initialization
async::task<void> OnInitializeAsync() override {
logger()->info("Temperature monitor initializing");
// Access capabilities granted in manifest
auto capabilities = module_info().capabilities;
logger()->info("Granted capabilities: {}", fmt::join(capabilities, ", "));
co_return;
}
// Called by host when module should start
async::task<void> StartAsync(CancellationToken token) override {
co_await ModuleBase::StartAsync(token);
// Subscribe to temperature threshold commands
co_await message_bus()->Subscribe(
[this](const SetThresholdCommand& cmd) -> async::task<void> {
co_await HandleThresholdCommand(cmd);
});
// Start monitoring
monitor_task_ = MonitorTemperature(token);
}
private:
async::task<void> HandleThresholdCommand(const SetThresholdCommand& command) {
logger()->info("Threshold set to {}", command.threshold);
threshold_ = command.threshold;
co_return;
}
async::task<void> MonitorTemperature(CancellationToken token) {
while (!token.is_cancellation_requested()) {
// Read temperature (simulated)
auto temp = ReadTemperature();
// Publish reading
co_await message_bus()->PublishAsync("telemetry/temperature",
TemperatureReading{ .value = temp });
co_await async::sleep_for(std::chrono::seconds(5));
}
}
double ReadTemperature() { return 23.5; } // Simulated
double threshold_ = 30.0;
async::task<void> monitor_task_;
};
// Register the module
NEXUS_MODULE_EXPORT(TemperatureMonitor)
classdef TemperatureMonitor < nexus.sdk.ModuleBase
% Temperature monitoring module using transport-agnostic SDK
properties (Constant)
ModuleId = 'temperature-monitor'
Version = '1.0.0'
end
properties (Access = private)
threshold = 30.0
timerObj
end
methods
function obj = TemperatureMonitor()
obj = obj@nexus.sdk.ModuleBase();
end
% Called by host during initialization
function onInitialize(obj)
obj.logger.info('Temperature monitor initializing');
% Access capabilities granted in manifest
capabilities = obj.moduleInfo.capabilities;
obj.logger.info(sprintf('Granted capabilities: %s', ...
strjoin(capabilities, ', ')));
end
% Called by host when module should start
function start(obj, cancellationToken)
start@nexus.sdk.ModuleBase(obj, cancellationToken);
% Subscribe to temperature threshold commands
obj.messageBus.subscribe('SetThresholdCommand', ...
@(cmd) obj.handleThresholdCommand(cmd));
% Start monitoring
obj.timerObj = timer('Period', 5, ...
'ExecutionMode', 'fixedRate', ...
'TimerFcn', @(~,~) obj.checkTemperature());
start(obj.timerObj);
end
% Called by host when module should stop
function stop(obj, cancellationToken)
if ~isempty(obj.timerObj)
stop(obj.timerObj);
delete(obj.timerObj);
end
stop@nexus.sdk.ModuleBase(obj, cancellationToken);
end
end
methods (Access = private)
function handleThresholdCommand(obj, command)
obj.logger.info(sprintf('Threshold set to %.2f', command.threshold));
obj.threshold = command.threshold;
end
function checkTemperature(obj)
% Read temperature (simulated)
temp = obj.readTemperature();
            % Create reading using the TemperatureReading message contract
            reading = TemperatureReading();
            reading.value = temp;
% Publish reading
obj.messageBus.publish('telemetry/temperature', reading);
end
function temp = readTemperature(obj)
temp = 20 + 10*rand(); % Simulated
end
end
end
% Message contracts (each classdef must be in its own .m file)
classdef SetThresholdCommand < nexus.sdk.RequestMessage
properties
threshold double
end
end
classdef TemperatureReading < nexus.sdk.EventMessage
properties
value double
end
end
// LabVIEW Temperature Monitor Module
// Uses Nexus SDK Client interfaces (transport-agnostic)
// Module Library: TemperatureMonitor.lvlib
// Properties:
// - Name: "Temperature Monitor"
// - Version: "1.0.0"
// Main VI: TemperatureMonitor.lvclass
// 1. Initialize.vi (Override from ModuleBase)
// Block Diagram:
// - Call Parent Initialize
// - Log "Temperature monitor initializing"
// - Get Module Info from Context
// - Extract Capabilities array
// - Log "Granted capabilities: [list]"
// - Set module ID = "temperature-monitor"
// 2. Start.vi (Override from ModuleBase)
// Block Diagram:
// - Call Parent Start
// - Subscribe to Commands:
// * Topic Pattern: "SetThresholdCommand"
// * Handler VI: HandleThresholdCommand.vi
// - Start Temperature Loop:
// * Create timed loop (5000ms period)
// * Call CheckTemperature.vi
// 3. HandleThresholdCommand.vi
// Inputs: Command (SetThresholdCommand.ctl)
// Block Diagram:
// - Extract threshold from command
// - Update module threshold property
// - Log "Threshold set to [value]"
// 4. CheckTemperature.vi
// Block Diagram:
// - Read temperature (simulated)
// - Create TemperatureReading:
// * Use EventMessage base class
// * Set value field
// - Publish to "telemetry/temperature"
// Message Type Definitions:
// SetThresholdCommand.ctl (inherits RequestMessage.ctl)
// - threshold (DBL)
// TemperatureReading.ctl (inherits EventMessage.ctl)
// - value (DBL)
// Key Points:
// - All transport handled by host
// - Use SDK base classes for messages
// - Module only implements business logic
// - No knowledge of gRPC or other transport
Understanding Capabilities and Priority-Based Routing
Capabilities are security permissions defined in the manifest file, not in your module code. This design provides:
- Deployment Flexibility - Same module binary can have different permissions in different environments
- Security by Default - Modules have no permissions unless explicitly granted
- Runtime Inspection - Modules can check their granted capabilities at runtime
- Clean Architecture - Security concerns separated from business logic
Priority-Based Capability Routing
When multiple modules provide the same capability, the priority field determines routing:
- Multiple Providers - Several modules can offer the same capability with different priority levels
- Caller Requirements - Callers can specify minimum priority requirements (e.g., "I need READ_SENSOR with priority ≥ 200")
- Smart Routing - The host selects the lowest-priority module that still meets the requirement
- Fallback Support - If a high-priority module fails, the host can route to lower-priority alternatives
# Example: Multiple modules providing READ_SENSOR capability
modules:
- id: "high-precision-sensor"
capabilities: ["READ_SENSOR"]
priority: 900 # High-quality, expensive sensor
- id: "standard-sensor"
capabilities: ["READ_SENSOR"]
priority: 500 # Standard quality
- id: "backup-sensor"
capabilities: ["READ_SENSOR"]
priority: 100 # Low-quality fallback
# Caller requests:
# - Priority 50: Routes to backup-sensor (100 ≥ 50)
# - Priority 200: Routes to standard-sensor (500 ≥ 200)
# - Priority 950: No module available (900 < 950)
Using Capability-Based Routing in Code
The SDK provides APIs to leverage capability routing:
// Request service by capability with minimum priority
var response = await MessageBus.RequestByCapabilityAsync(
capability: "READ_SENSOR",
request: new SensorRequest { SensorId = "temp1" },
minimumPriority: 800, // Require high-quality sensor
timeout: TimeSpan.FromSeconds(5)
);
// Check capability availability before requesting
if (await MessageBus.IsCapabilityAvailableAsync("WRITE_PLC_DATA", minimumPriority: 500))
{
// Safe to make the request
}
// Discover available providers
var providers = await MessageBus.GetCapabilityProvidersAsync("READ_SENSOR");
foreach (var provider in providers.OrderByDescending(p => p.Priority))
{
Logger.LogInformation("{Name}: Priority {Priority}, Healthy: {Healthy}",
provider.ModuleName, provider.Priority, provider.IsHealthy);
}
// Get only high-priority providers (e.g., for critical operations)
var criticalProviders = await MessageBus.GetCapabilityProvidersAsync(
"READ_SENSOR",
minimumPriority: 700
);
This enables sophisticated patterns like quality-of-service tiers, graceful degradation, and cost optimization.
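For example, graceful degradation can be built from these APIs by walking down the priority tiers until a provider responds. A sketch (the tier values and the SensorRequest type follow the examples above):
// Sketch: degrade from high-quality providers to fallbacks
foreach (var tier in new[] { 800, 400, 0 })
{
    if (await MessageBus.IsCapabilityAvailableAsync("READ_SENSOR", minimumPriority: tier))
    {
        var response = await MessageBus.RequestByCapabilityAsync(
            capability: "READ_SENSOR",
            request: new SensorRequest { SensorId = "temp1" },
            minimumPriority: tier,
            timeout: TimeSpan.FromSeconds(5));
        Logger.LogInformation("READ_SENSOR served at priority tier >= {Tier}", tier);
        break; // served by the best tier currently available
    }
    Logger.LogWarning("No READ_SENSOR provider at priority >= {Tier}; degrading", tier);
}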
3. Build Your Module
dotnet build -c Release
# Create package
python -m build
# Or for development
pip install -e .
# Using CMake
cmake -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build
# Or using make directly
make release
% No build step required for MATLAB
% Just ensure all files are in the module directory:
% - TemperatureMonitor.m (main class)
% - SetThresholdCommand.m
% - TemperatureReading.m
% Build steps in LabVIEW:
% 1. Open project in LabVIEW
% 2. Right-click Build Specifications
% 3. Select "New > Packed Library"
% 4. Configure:
% - Source Files: Add module VIs
% - Destinations: Set output path
% 5. Build
4. Deploy to Nexus Host
Manifest File Naming
The manifest file can have any name (e.g., nexus-manifest.yaml, production.yaml, dev-config.yaml):
- If named nexus-manifest.yaml in the current directory, Nexus will use it by default
- For any other name or location, use the --manifest argument when starting Nexus
- This allows you to maintain multiple configurations (dev, staging, production)
- Copy your module DLL to the host's modules/ directory
- Add your module to nexus-manifest.yaml:
modules:
  - id: "temperature-monitor"
    name: "Temperature Monitor"
    version: "1.0.0"
    language: "csharp"
    assembly: "modules/TemperatureMonitor.dll"
    priority: 100  # Optional: Module priority (0-1000, default: 500)
    capabilities:
      - "PUBLISH_TELEMETRY"
      - "READ_SENSORS"
- Start the Nexus host:
# Using default manifest name (nexus-manifest.yaml)
dotnet Nexus.Host.dll
# Using custom manifest name
dotnet Nexus.Host.dll --manifest my-custom-manifest.yaml
# Using manifest in different directory
dotnet Nexus.Host.dll --manifest /path/to/config/production-manifest.yaml
- Copy your module package to the host's modules/ directory
- Add your module to nexus-manifest.yaml:
modules:
  - id: "temperature-monitor"
    name: "Temperature Monitor"
    version: "1.0.0"
    language: "python"
    path: "modules/temperature_monitor"
    entry: "temperature_monitor.TemperatureMonitor"
    capabilities:
      - "PUBLISH_TELEMETRY"
      - "READ_SENSORS"
- Start the Nexus host:
# Using default manifest name (nexus-manifest.yaml)
dotnet Nexus.Host.dll
# Using custom manifest name
dotnet Nexus.Host.dll --manifest my-custom-manifest.yaml
# Using manifest in different directory
dotnet Nexus.Host.dll --manifest /path/to/config/production-manifest.yaml
- Copy your module library to the host's modules/ directory
- Add your module to nexus-manifest.yaml:
modules:
  - id: "temperature-monitor"
    name: "Temperature Monitor"
    version: "1.0.0"
    language: "cpp"
    library: "modules/libTemperatureMonitor.so"  # Linux
    # library: "modules/TemperatureMonitor.dll"  # Windows
    capabilities:
      - "PUBLISH_TELEMETRY"
      - "READ_SENSORS"
- Start the Nexus host:
# Using default manifest name (nexus-manifest.yaml)
dotnet Nexus.Host.dll
# Using custom manifest name
dotnet Nexus.Host.dll --manifest my-custom-manifest.yaml
# Using manifest in different directory
dotnet Nexus.Host.dll --manifest /path/to/config/production-manifest.yaml
- Copy your module files to the host's modules/ directory
- Add your module to nexus-manifest.yaml:
modules:
  - id: "temperature-monitor"
    name: "Temperature Monitor"
    version: "1.0.0"
    language: "matlab"
    path: "modules/temperature-monitor"
    mainClass: "TemperatureMonitor"
    capabilities:
      - "PUBLISH_TELEMETRY"
      - "READ_SENSORS"
- Start the Nexus host:
# Using default manifest name (nexus-manifest.yaml)
dotnet Nexus.Host.dll
# Using custom manifest name
dotnet Nexus.Host.dll --manifest my-custom-manifest.yaml
# Using manifest in different directory
dotnet Nexus.Host.dll --manifest /path/to/config/production-manifest.yaml
- Copy your packed library (.lvlibp) to the host's modules/ directory
- Add your module to nexus-manifest.yaml:
modules:
  - id: "temperature-monitor"
    name: "Temperature Monitor"
    version: "1.0.0"
    language: "labview"
    path: "modules/TemperatureMonitor.lvlibp"
    mainVI: "TemperatureMonitor.vi"
    capabilities:
      - "PUBLISH_TELEMETRY"
      - "READ_SENSORS"
- Start the Nexus host:
# Using default manifest name (nexus-manifest.yaml)
dotnet Nexus.Host.dll
# Using custom manifest name
dotnet Nexus.Host.dll --manifest my-custom-manifest.yaml
# Using manifest in different directory
dotnet Nexus.Host.dll --manifest /path/to/config/production-manifest.yaml
That's It!
The Nexus host will:
- Load your module (DLL, Python package, C++ library, MATLAB files, or LabVIEW VIs)
- Find your module class using language-specific discovery:
  - C#: Classes with [Module] attribute
  - Python: Classes decorated with @module
  - C++: Classes registered with NEXUS_MODULE_EXPORT
  - MATLAB: Classes inheriting from nexus.sdk.ModuleBase
  - LabVIEW: VIs implementing the module interface
- Inject IModuleContext with all required services
- Call your lifecycle methods at the appropriate times
- Handle all message routing and transport (you never see gRPC, HTTP, etc.)
Remember: Your module only uses the SDK interfaces. The host provides all the actual implementations!
Module Manifest
The module manifest is a YAML configuration file that defines how modules are loaded, configured, and integrated into the NEXUS-1 system. As a module developer, you need to understand how to write manifest entries for your modules.
The manifest file (typically nexus-manifest.yaml) contains the entire NEXUS-1 application configuration. Your module entries will be added to the modules section of this file.
Full Manifest Structure
Complete nexus-manifest.yaml Example
Here's a complete manifest file showing where your module configuration fits:
# nexus-manifest.yaml
version: "1.0"
# Application metadata (configured by system administrator)
application:
name: "NEXUS-1 Industrial Control System"
version: "1.0.0"
description: "Mission-critical industrial application orchestrator"
author: "NEXUS-1 Team"
tags:
- industrial
- automation
- orchestrator
# Runtime configuration (configured by system administrator)
runtime:
isolation: process # process | container | vm
monitoring:
healthCheckInterval: 30s
restartPolicy: onFailure # never | onFailure | always
maxRestarts: 3
restartWindow: 5m
enableMetrics: true
enableTracing: true
security:
enableEncryption: true
enableAuthentication: true
certificatePath: "./certs/nexus.pfx"
messaging:
transport: grpc
port: 5000
maxMessageSize: 4194304 # 4MB
defaultTimeout: 30s
# Module definitions - YOUR MODULES GO HERE
modules:
# Example: Temperature Monitor Module
- id: "temperature-monitor"
name: "Temperature Monitor"
language: "python"
path: "modules/temperature-monitor"
version: "1.0.0"
priority: 500 # Module priority for capability routing
critical: false
dependencies:
- "sensor-driver"
capabilities:
- "READ_SENSORS"
- "PUBLISH_TELEMETRY"
- "GENERATE_ALERTS"
resources:
memory: "512MB"
cpu: 1.0
disk: "100MB"
config:
sensorTypes:
- "thermocouple"
- "rtd"
- "thermistor"
sampleRate: 1s
alertThresholds:
high: 85.0
critical: 95.0
healthCheck:
interval: 10s
timeout: 5s
failureThreshold: 3
replication:
mode: activePassive
replicas: 2
# Add more modules here...
# Global configuration (optional)
globalConfig:
environment: "production"
logLevel: "information"
timezone: "UTC"
culture: "en-US"
Module Properties Reference
Each module entry in the manifest can include the following properties:
Property | Type | Required | Description
---|---|---|---
id | string | Yes | Unique identifier for the module. Use kebab-case (e.g., "my-module")
name | string | Yes | Human-readable module name
version | string | Yes | Module version using semantic versioning (e.g., "1.0.0")
language | string | Yes | Module language: csharp, python, cpp, matlab, labview
path | string | Yes* | Path to module directory (Python, LabVIEW, MATLAB)
assembly | string | Yes* | Path to .NET assembly (C# modules only)
library | string | Yes* | Path to shared library (C++ modules only)
critical | boolean | No | If true, module failure will shut down the entire system (default: false)
priority | integer | No | Module capability priority (0-1000). Used for routing when multiple modules provide the same capability. Higher = better quality/reliability. Default: 500
capabilities | array | Yes | List of required capabilities. Must match available system capabilities
dependencies | array | No | List of other module IDs this module depends on
config | object | No | Module-specific configuration parameters
resources | object | No | Resource limits: memory, cpu, disk
healthCheck | object | No | Health check configuration
replication | object | No | Module replication settings for high availability
* Note: Use path for Python/LabVIEW/MATLAB, assembly for C#, or library for C++
Language-Specific Manifest Examples
Here are complete manifest entries for modules in each supported language:
# C# Module Manifest Entry
modules:
- id: "plc-controller"
name: "PLC Controller"
language: "csharp"
assembly: "modules/bin/PlcController.dll"
version: "1.0.0"
priority: 800 # High priority for critical PLC operations
critical: true
capabilities:
- "READ_PLC_DATA"
- "WRITE_CONTROLS"
- "MONITOR_ALARMS"
dependencies:
- "modbus-driver"
config:
plcAddress: "192.168.1.100"
plcPort: 502
pollInterval: 1000
reconnectDelay: 5s
tags:
temperature: "DB1.DBD0"
pressure: "DB1.DBD4"
flowRate: "DB1.DBD8"
resources:
memory: "1GB"
cpu: 2.0
disk: "500MB"
healthCheck:
interval: 10s
timeout: 5s
failureThreshold: 3
# Python Module Manifest Entry
modules:
- id: "temperature-monitor"
name: "Temperature Monitor"
language: "python"
path: "modules/temperature-monitor"
version: "1.0.0"
priority: 500 # Standard priority sensor
critical: false
dependencies:
- "sensor-driver"
capabilities:
- "READ_SENSORS"
- "PUBLISH_TELEMETRY"
- "GENERATE_ALERTS"
config:
sensorTypes:
- "thermocouple"
- "rtd"
- "thermistor"
sampleRate: 1s
alertThresholds:
high: 85.0
critical: 95.0
calibration:
offset: 0.0
scale: 1.0
resources:
memory: "512MB"
cpu: 1.0
disk: "100MB"
healthCheck:
interval: 10s
timeout: 5s
replication:
mode: activePassive
replicas: 2
# C++ Module Manifest Entry
modules:
- id: "realtime-controller"
name: "Realtime Controller"
language: "cpp"
library: "modules/libRealtimeController.so" # Linux
# library: "modules/RealtimeController.dll" # Windows
version: "3.0.0"
critical: true
capabilities:
- "HARDWARE_ACCESS"
- "REALTIME_CONTROL"
- "LOW_LEVEL_IO"
config:
controlLoopHz: 1000
priority: 95
cpuAffinity: [4, 5]
memoryLocked: true
devicePath: "/dev/rtcontrol0"
resources:
memory: "2GB"
cpu: 2.0
disk: "200MB"
healthCheck:
interval: 5s
timeout: 2s
maxLatencyMs: 5
# MATLAB Module Manifest Entry
modules:
- id: "signal-analyzer"
name: "Signal Analyzer"
language: "matlab"
path: "modules/signal-analyzer"
matlabFile: "SignalAnalyzer.m" # Main class file
version: "1.2.0"
critical: false
capabilities:
- "SIGNAL_PROCESSING"
- "FFT_ANALYSIS"
- "DATA_VISUALIZATION"
dependencies:
- "data-acquisition"
config:
sampleRate: 48000
fftSize: 2048
windowType: "hanning"
overlapPercent: 50
enableGpu: true
channels: [1, 2, 3, 4]
resources:
memory: "4GB"
cpu: 2.0
disk: "1GB"
healthCheck:
interval: 15s
timeout: 10s
# LabVIEW Module Manifest Entry
modules:
- id: "daq-controller"
name: "Data Acquisition Controller"
language: "labview"
path: "modules/daq-controller"
viFile: "DAQController.vi" # Main VI
version: "2.5.0"
critical: false
capabilities:
- "HARDWARE_DAQ"
- "SIGNAL_ACQUISITION"
- "WAVEFORM_GENERATION"
dependencies:
- "signal-analyzer"
config:
device: "cDAQ1"
channels:
analogIn: ["ai0:7"]
analogOut: ["ao0:1"]
digitalIO: ["port0"]
sampleRate: 10000
bufferSize: 100000
triggerMode: "continuous"
clockSource: "internal"
resources:
memory: "2GB"
cpu: 2.0
disk: "500MB"
healthCheck:
interval: 10s
timeout: 5s
Accessing Configuration in Your Module
Reading Configuration Values
Your module can access configuration values from the manifest through the SDK:
public class PlcController : ModuleBase
{
protected override Task OnInitializeAsync()
{
// Access configuration values
var plcAddress = Configuration["plcAddress"];
var plcPort = Configuration.GetValue<int>("plcPort");
var pollInterval = Configuration.GetValue<int>("pollInterval");
// Access nested configuration
var tempTag = Configuration["tags:temperature"];
// Bind configuration to a class
var plcConfig = new PlcConfiguration();
        Configuration.Bind(plcConfig);
        return Task.CompletedTask;
    }
}
public class PlcConfiguration
{
public string PlcAddress { get; set; }
public int PlcPort { get; set; }
public int PollInterval { get; set; }
    public Dictionary<string, string> Tags { get; set; }
}
class TemperatureMonitor(Module):
def initialize(self):
# Access configuration values
sample_rate = self.config.get('sampleRate', '1s')
sensor_types = self.config.get('sensorTypes', [])
# Access nested configuration
high_threshold = self.config.get('alertThresholds', {}).get('high', 85.0)
critical_threshold = self.config.get('alertThresholds', {}).get('critical', 95.0)
# Access with defaults
calibration = self.config.get('calibration', {
'offset': 0.0,
'scale': 1.0
})
self.logger.info(f"Configured with sample rate: {sample_rate}")
self.logger.info(f"Monitoring sensors: {sensor_types}")
class RealtimeController : public nexus::ModuleBase {
protected:
void on_initialized() override {
// Access configuration values
        auto loop_hz = config()->get<int>("controlLoopHz", 1000);
        auto priority = config()->get<int>("priority", 90);
        // Access array configuration
        auto cpu_affinity = config()->get_array<int>("cpuAffinity");
        // Access boolean configuration
        auto memory_locked = config()->get<bool>("memoryLocked", false);
        // Access string configuration
        auto device_path = config()->get<std::string>("devicePath");
logger()->info("Control loop frequency: {} Hz", loop_hz);
}
};
Advanced Features
Module Dependencies
Specify dependencies between modules to ensure proper startup order:
modules:
- id: "data-logger"
name: "Data Logger"
version: "1.0.0"
language: "csharp"
assembly: "modules/DataLogger.dll"
dependencies:
- "temperature-monitor" # Must start after temperature-monitor
- "pressure-monitor" # Must start after pressure-monitor
capabilities:
- "STORE_DATA"
- "QUERY_HISTORY"
Health Check Configuration
Configure how NEXUS-1 monitors your module's health:
modules:
- id: "critical-monitor"
name: "Critical System Monitor"
version: "1.0.0"
language: "csharp"
assembly: "modules/CriticalMonitor.dll"
critical: true # System stops if this module fails
healthCheck:
interval: 10s # Check every 10 seconds
timeout: 5s # Timeout after 5 seconds
failureThreshold: 3 # Unhealthy after 3 failures
successThreshold: 1 # Healthy after 1 success
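On the module side, the host's checks call your module's health method. A minimal C# sketch, assuming ModuleBase lets you override the CheckHealthAsync defined on IModule (see the API Reference); the _connected field is illustrative:
// Sketch: reporting health from module state
public override Task<HealthStatus> CheckHealthAsync()
{
    return Task.FromResult(_connected
        ? HealthStatus.Healthy("Connected")
        : HealthStatus.Unhealthy("Connection lost"));
}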
Environment Variables in Configuration
Use environment variables for sensitive or environment-specific values:
modules:
- id: "database-connector"
name: "Database Connector"
version: "1.0.0"
language: "python"
path: "modules/db-connector"
config:
# Environment variable substitution
connectionString: "${DB_CONNECTION}"
apiKey: "${API_KEY}"
# With default values
port: "${DB_PORT:-5432}"
environment: "${ENV:-production}"
# Nested configuration
retry:
maxAttempts: "${MAX_RETRIES:-3}"
delayMs: 1000
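Substitution is applied before your module reads its configuration, so the values are consumed like any other key. A short C# sketch using the configuration API shown earlier:
// Values arrive already substituted; no special handling in the module
var connectionString = Configuration["connectionString"];
var port = Configuration.GetValue<int>("port");            // "${DB_PORT:-5432}" resolves to 5432 if unset
var maxRetries = Configuration.GetValue<int>("retry:maxAttempts");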
Resource Limits
Define resource constraints for your module:
modules:
- id: "analytics-engine"
name: "Analytics Engine"
version: "1.0.0"
language: "python"
path: "modules/analytics"
resources:
memory: "4GB" # Maximum memory usage
cpu: 2.0 # Maximum CPU cores (2.0 = 2 cores)
disk: "10GB" # Maximum disk usage
Module Replication
Configure high availability with module replication:
modules:
- id: "message-processor"
name: "Message Processor"
version: "1.0.0"
language: "csharp"
assembly: "modules/MessageProcessor.dll"
replication:
mode: activePassive # activePassive or activeActive
replicas: 3 # Number of instances
loadBalancing: roundRobin # Load balancing strategy
Module Path Resolution
Understanding Module Path Resolution
NEXUS-1 uses intelligent path resolution to locate your modules, making deployments more reliable across different environments. Module paths in the manifest are resolved in the following order:
- Absolute Paths: If the path is absolute, it's used as-is
- Manifest-Relative Paths: Paths are resolved relative to the directory containing the manifest file
- Application Directory: Falls back to the NEXUS-1 application directory
- Current Working Directory: Finally tries the current working directory
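The order above amounts to a simple fallback chain. The following C# sketch is illustrative of the documented rules, not the host's actual implementation:
// Illustrative sketch of the documented resolution order
using System;
using System.IO;

static string? ResolveModulePath(string path, string manifestDir, string appDir)
{
    // 1. Absolute paths are used as-is
    if (Path.IsPathRooted(path))
        return File.Exists(path) ? path : null;
    // 2-4. Try manifest directory, application directory, then current directory
    foreach (var root in new[] { manifestDir, appDir, Directory.GetCurrentDirectory() })
    {
        var candidate = Path.GetFullPath(Path.Combine(root, path));
        if (File.Exists(candidate))
            return candidate;
    }
    return null; // not found; the host reports every searched location
}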
Path Resolution Examples
# Example: Manifest located at /app/config/nexus-manifest.yaml
modules:
# Absolute path - used directly
- id: "module1"
language: "csharp"
assembly: "/opt/nexus/modules/Module1.dll"
# Relative path - resolved from manifest directory first
- id: "module2"
language: "csharp"
assembly: "../modules/Module2.dll"
# Resolves to: /app/modules/Module2.dll
# Subdirectory - resolved from manifest directory
- id: "module3"
language: "csharp"
assembly: "modules/Module3.dll"
# Resolves to: /app/config/modules/Module3.dll
# Python module with directory path
- id: "python-module"
language: "python"
path: "../python-modules/sensor-reader"
# Resolves to: /app/python-modules/sensor-reader/
Best Practice: Use Relative Paths
Use paths relative to your manifest file for maximum portability. This ensures your modules are found correctly when:
- Running NEXUS-1 from different directories
- Deploying to different environments
- Moving your application to a new server
Module Path Configuration
# Windows Path Examples
modules:
# Absolute path
- id: "plc-controller"
assembly: "C:\\Nexus\\modules\\PlcController.dll"
# Relative to manifest
- id: "sensor-module"
assembly: "..\\modules\\SensorModule.dll"
# UNC network path
- id: "shared-module"
assembly: "\\\\server\\nexus-modules\\SharedModule.dll"
# Linux/macOS Path Examples
modules:
# Absolute path
- id: "plc-controller"
assembly: "/opt/nexus/modules/PlcController.dll"
# Relative to manifest
- id: "sensor-module"
assembly: "../modules/SensorModule.dll"
# Home directory
- id: "user-module"
assembly: "~/nexus-modules/UserModule.dll"
# Docker Path Examples
modules:
# Container absolute path
- id: "plc-controller"
assembly: "/app/modules/PlcController.dll"
# Volume-mounted modules
- id: "sensor-module"
assembly: "/modules/SensorModule.dll"
# Relative to workdir
- id: "local-module"
assembly: "./modules/LocalModule.dll"
Debugging Path Resolution
If NEXUS-1 can't find your module, it provides detailed error messages showing all searched locations:
Failed to load module 'sensor-module' (Temperature Sensor)
Assembly: ../modules/TemperatureSensor.dll
Searched locations:
[✗] /app/config/../modules/TemperatureSensor.dll
[✗] /opt/nexus/bin/../modules/TemperatureSensor.dll
[✗] /app/../modules/TemperatureSensor.dll
[✗] /current/dir/../modules/TemperatureSensor.dll
Context:
Manifest directory: /app/config
Current directory: /current/dir
Application directory: /opt/nexus/bin/
Tip: Ensure the module DLL exists at one of the searched locations.
Module paths are resolved relative to the manifest file's directory.
Common Path Resolution Issues
- Spaces in paths: Always quote paths containing spaces
- Case sensitivity: Linux paths are case-sensitive
- Symbolic links: Resolved to their targets
- Network paths: Ensure proper permissions and availability
Message Bus Configuration
Overview
NEXUS-1 supports multiple message bus implementations to accommodate different deployment scenarios and requirements. The message bus is the core communication infrastructure that enables modules to interact with each other through publish/subscribe, request/response, and streaming patterns.
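From a module's point of view the transport is invisible; the same SDK calls work over any bus type. A C# sketch of the pub/sub, request/response, and subscription patterns, using the message client methods shown elsewhere in this guide (topic names and payload types are illustrative; streaming follows the same model):
// Publish/subscribe: fire-and-forget events
await MessageBus.PublishAsync("telemetry/temperature", new TemperatureReading { Value = 23.5 });
// Request/response: send a request and await the reply
var reply = await MessageBus.RequestAsync("sensors/read",
    new SensorRequest { SensorId = "temp1" }, timeout: TimeSpan.FromSeconds(5));
// Subscription: handle matching messages as they arrive
await MessageBus.Subscribe<SetThresholdCommand>(async cmd =>
{
    Logger.LogInformation("Threshold update: {Value}", cmd.Threshold);
});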
Available Message Bus Types
Configuration Examples
# Using gRPC (default)
{
"MessageBus": {
"Type": "grpc",
"Grpc": {
"Port": 5000,
"MaxMessageSize": 4194304,
"EnableTls": false
}
}
}
# Using NATS
{
"MessageBus": {
"Type": "nats",
"Nats": {
"Url": "nats://localhost:4222",
"Name": "nexus-host",
"ReconnectWait": 2000,
"MaxReconnects": 10,
"EnableTls": false
}
}
}
# Using RabbitMQ
{
"MessageBus": {
"Type": "rabbitmq",
"RabbitMQ": {
"HostName": "localhost",
"Port": 5672,
"UserName": "guest",
"Password": "guest",
"VirtualHost": "/",
"Exchange": "nexus-exchange",
"EnableTls": false
}
}
}
# Select message bus type
export MessageBus__Type=nats
# NATS configuration
export MessageBus__Nats__Url=nats://nats-cluster.example.com:4222
export MessageBus__Nats__Name=nexus-production
export MessageBus__Nats__EnableTls=true
export MessageBus__Nats__TlsCert=/certs/client-cert.pem
export MessageBus__Nats__TlsKey=/certs/client-key.pem
# RabbitMQ configuration
export MessageBus__Type=rabbitmq
export MessageBus__RabbitMQ__HostName=rabbitmq.example.com
export MessageBus__RabbitMQ__Port=5672
export MessageBus__RabbitMQ__UserName=nexus_user
export MessageBus__RabbitMQ__Password=secure_password
export MessageBus__RabbitMQ__VirtualHost=/nexus
export MessageBus__RabbitMQ__EnableTls=true
# In nexus-manifest.yaml
runtime:
messaging:
transport: nats # Options: grpc, nats, rabbitmq
config:
# NATS-specific configuration
url: nats://nats-cluster.example.com:4222
cluster_id: nexus-cluster
client_id: nexus-host-01
# Connection resilience
reconnect_wait: 2s
max_reconnects: -1 # Infinite
# TLS configuration
tls:
enabled: true
ca_cert: /certs/ca.crt
client_cert: /certs/client.crt
client_key: /certs/client.key
Feature Comparison
Feature | gRPC | NATS | RabbitMQ |
---|---|---|---|
Pub/Sub | ✓ | ✓ | ✓ |
Request/Reply | ✓ | ✓ | ✓ |
Streaming | ✓ | ✓ | ✓ |
Message Persistence | ✗ | ✓* | ✓ |
Clustering | ✗ | ✓ | ✓ |
Rate Limiting | ✓ | ✓ | ✓ |
Auto-Reconnect | ✓ | ✓ | ✓ |
External Dependency | None | NATS Server | RabbitMQ Server |
Protocol | HTTP/2 | TCP | AMQP |
Best Latency | < 1ms | < 1ms | 1-5ms |
* NATS requires JetStream to be enabled for persistence
Deployment Considerations
When to use gRPC (default)
- All modules run on the same host or within the same network
- You need the lowest possible latency
- You don't want to manage external infrastructure
- Your deployment is relatively simple and self-contained
When to use NATS
- Modules are distributed across multiple hosts or cloud regions
- You need automatic failover and high availability
- You're building a cloud-native application
- You need subject-based routing with wildcards
- You want to scale horizontally with minimal configuration
When to use RabbitMQ
- You have existing RabbitMQ infrastructure
- You need complex routing rules and message transformations
- Message durability and guaranteed delivery are critical
- You need dead letter queues for failed message handling
- You require extensive monitoring and management tools
Rate Limiting
All message bus implementations support rate limiting to prevent module overload:
// In your module code
protected override async Task OnInitializeAsync()
{
// Check if the message bus supports configuration
if (MessageBus is IConfigurableMessageBus configurable)
{
// Set rate limit to 100 messages per second
configurable.SetRateLimit(100);
// Get current rate limit
var currentLimit = configurable.GetRateLimit();
Logger.LogInformation($"Rate limit set to {currentLimit} msg/s");
}
}
Connection Resilience
All adapters implement automatic reconnection with exponential backoff:
Resilience Features
- Automatic Reconnection: Adapters automatically reconnect on connection loss
- Exponential Backoff: Prevents overwhelming the server with reconnection attempts
- Message Buffering: Some adapters buffer messages during disconnection
- Health Reporting: Connection status is reported to the health monitoring system
- Graceful Degradation: Modules continue to function with reduced capabilities during disconnection
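Conceptually, each adapter's reconnect loop behaves like the sketch below (illustrative only; ConnectAsync and WaitUntilDisconnectedAsync are hypothetical names, the real logic ships inside the SDK adapters):
// Illustrative exponential-backoff reconnect loop
var delay = TimeSpan.FromSeconds(1);
var maxDelay = TimeSpan.FromSeconds(60);
while (!token.IsCancellationRequested)
{
    try
    {
        await ConnectAsync(token);                    // hypothetical connect call
        delay = TimeSpan.FromSeconds(1);              // reset backoff after success
        await WaitUntilDisconnectedAsync(token);      // hypothetical: block until the link drops
    }
    catch (Exception ex)
    {
        Logger.LogWarning(ex, "Connection lost; retrying in {Delay}", delay);
        await Task.Delay(delay, token);
        delay = TimeSpan.FromTicks(Math.Min(delay.Ticks * 2, maxDelay.Ticks)); // double, capped
    }
}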
Important Notes
- Message bus type must be configured before starting NEXUS-1
- All modules in a deployment must use the same message bus type
- Ensure the selected message bus infrastructure is available before starting
- For NATS and RabbitMQ, ensure proper network connectivity and authentication
Capability Declaration - Hybrid System
How the Hybrid System Works
The hybrid capability system provides maximum flexibility by combining compile-time and runtime capabilities:
- Attributes (Compile-time): Declare capabilities the module knows about in code
- Manifest (Runtime): Add additional capabilities without recompiling
- Result: Module has ALL capabilities from both sources (union)
Step 1: Declare Capabilities in Code
NEW: Use ModuleCapability Attributes
Declare capabilities your module provides using attributes. These are automatically enabled:
// C# Example - Declare capabilities in code
[Module("sensor-module", "1.0.0")]
[ModuleCapability("READ_TEMPERATURE", Required = true, Description = "Read temperature data")]
[ModuleCapability("READ_PRESSURE", Description = "Read pressure data")]
[ModuleCapability("CALIBRATE_SENSORS", Required = false, Description = "Sensor calibration")]
[ModuleCapability("DIAGNOSTIC_MODE", Description = "Extended diagnostics")]
public class SensorModule : ModuleBase
{
protected override async Task OnInitializeAsync()
{
// Check which capabilities are enabled
if (IsCapabilityEnabled("DIAGNOSTIC_MODE"))
{
Logger.LogInformation("Diagnostics enabled");
}
// All capabilities from attributes are automatically enabled
// Manifest can add additional capabilities at runtime
}
}
Step 2: Add Additional Capabilities in Manifest
The manifest can add additional capabilities beyond those declared in code:
# Add runtime capabilities to extend module functionality
modules:
- id: "sensor-module"
name: "Sensor Module"
version: "1.0.0"
capabilities: # ← Add extra capabilities at runtime
# All attribute capabilities are already enabled:
# - READ_TEMPERATURE (from attributes - required)
# - READ_PRESSURE (from attributes)
# - CALIBRATE_SENSORS (from attributes)
# - DIAGNOSTIC_MODE (from attributes)
# Add additional capabilities at runtime:
- "EXPORT_CSV" # New capability added via manifest
- "REMOTE_CONFIG" # New capability added via manifest
- "API_ACCESS" # New capability added via manifest
priority: 500 # Priority for capability routing
type: "dotnet"
assembly: "modules/SensorModule.dll"
Validation and Safety
- Capabilities from attributes are automatically enabled
- Manifest can add additional capabilities at runtime
- Final capabilities = Attributes ∪ Manifest (union)
- Required capabilities (from attributes) must remain available
Example: Understanding the Union Behavior
// Module declares these capabilities via attributes:
[ModuleCapability("READ_DATA")]
[ModuleCapability("WRITE_DATA")]
[ModuleCapability("PROCESS_DATA")]
// Manifest adds these capabilities:
capabilities:
- "EXPORT_JSON"
- "EXPORT_XML"
- "API_ACCESS"
// Result - Module has ALL these capabilities:
// ✓ READ_DATA (from attributes)
// ✓ WRITE_DATA (from attributes)
// ✓ PROCESS_DATA (from attributes)
// ✓ EXPORT_JSON (from manifest)
// ✓ EXPORT_XML (from manifest)
// ✓ API_ACCESS (from manifest)
// In code, all are available:
if (IsCapabilityEnabled("READ_DATA")) { } // true
if (IsCapabilityEnabled("EXPORT_JSON")) { } // true
Logger.LogInformation($"Total capabilities: {EnabledCapabilities.Count}"); // 6
Common Capabilities
Standard System Capabilities
Here are common capabilities that modules can declare in their manifest:
Capability | Description | Risk Level
---|---|---
READ_SENSORS | Read sensor data | Low
WRITE_CONTROLS | Write to control systems | High
PUBLISH_TELEMETRY | Publish telemetry data | Low
GENERATE_ALERTS | Generate system alerts | Medium
STORE_DATA | Store data persistently | Medium
QUERY_HISTORY | Query historical data | Low
HARDWARE_ACCESS | Direct hardware access | Critical
REALTIME_CONTROL | Real-time control operations | Critical
Validation and Troubleshooting
Common Manifest Issues
- Invalid YAML syntax: Use a YAML validator to check your syntax
- Missing required fields: Ensure id, name, version, language, and capabilities are present
- Path not found: Verify that assembly/path/library points to the correct location
- Unknown capabilities: Check that requested capabilities are available in the system
- Circular dependencies: Ensure modules don't depend on each other in a circle
Module Loading Order
Modules are loaded in the following order:
- Modules with no dependencies
- Modules whose dependencies are already loaded
- Critical modules are prioritized within each group
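This is effectively a topological sort by dependencies, with critical modules first within each wave. A C# sketch of the idea (illustrative, not the host's code):
// Illustrative: compute load order in dependency waves
using System;
using System.Collections.Generic;
using System.Linq;

record ModuleEntry(string Id, bool Critical, IReadOnlyList<string> Dependencies);

static List<ModuleEntry> ComputeLoadOrder(IEnumerable<ModuleEntry> modules)
{
    var loaded = new HashSet<string>();
    var order = new List<ModuleEntry>();
    var remaining = modules.ToList();
    while (remaining.Count > 0)
    {
        // Next wave: modules whose dependencies are all loaded, critical first
        var wave = remaining.Where(m => m.Dependencies.All(loaded.Contains))
                            .OrderByDescending(m => m.Critical)
                            .ToList();
        if (wave.Count == 0)
            throw new InvalidOperationException("Circular or missing dependency detected");
        foreach (var m in wave)
        {
            loaded.Add(m.Id);
            order.Add(m);
            remaining.Remove(m);
        }
    }
    return order;
}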
If your module isn't starting, check:
- All dependencies are listed and available
- No circular dependencies exist
- Required capabilities are granted
API Reference
The Nexus SDK Client provides the ModuleBase class, interfaces, and contracts for module development. The SDK includes both interfaces and the ModuleBase implementation - the host provides the runtime services.
Important Architecture Note
Nexus-1 has TWO separate interface implementations:
- SDK Client (Nexus.SDK.Client) - Simplified interfaces with ModuleBase class for module developers (use this!)
- Core Contracts (Nexus.Contracts) - Full interfaces used internally by the host
Always inherit from ModuleBase - do NOT implement IModule directly!
Core Interfaces
IModule Interface and ModuleBase Class
The IModule interface defines the module contract. Always use ModuleBase which implements this interface and provides:
- Automatic logger creation from context
- Easy access to MessageBus
- Module metadata and capabilities from manifest
- Simplified lifecycle methods
namespace Nexus.Contracts
{
    /// <summary>
    /// Core interface for all Nexus modules
    /// </summary>
    public interface IModule
    {
        /// <summary>
        /// Initialize the module with context from the host
        /// </summary>
        Task InitializeAsync(IModuleContext context);

        /// <summary>
        /// Start the module
        /// </summary>
        Task StartAsync(CancellationToken cancellationToken);

        /// <summary>
        /// Stop the module gracefully
        /// </summary>
        Task StopAsync(CancellationToken cancellationToken);

        /// <summary>
        /// Get current health status
        /// </summary>
        Task<HealthStatus> CheckHealthAsync();
    }
}
}
# Python Module Base Class (this is the base class nexus_sdk provides)
from typing import Optional, Dict, Any
import asyncio
class Module:
"""Base class for all Nexus modules"""
def __init__(self):
self.messages = None # Injected by runtime
self.logger = None # Injected by runtime
self.config = None # Injected by runtime
def initialize(self) -> None:
"""Called when module is initialized"""
pass
async def start(self) -> None:
"""Called when module starts"""
pass
async def stop(self) -> None:
"""Called when module stops"""
pass
async def get_health(self) -> Dict[str, Any]:
"""Return module health status"""
return {"status": "healthy"}
# Helper methods
def subscribe(self, topic: str, handler):
"""Subscribe to message topic"""
pass
async def publish(self, topic: str, payload: Any):
"""Publish message to topic"""
pass
// C++ Module Base Class
namespace nexus {
class ModuleBase {
public:
ModuleBase(const std::string& id,
const std::string& name,
const std::string& version);
virtual ~ModuleBase() = default;
// Lifecycle methods
    virtual Result<void> initialize() { return success(); }
    virtual Result<void> start() { return success(); }
    virtual Result<void> stop() { return success(); }
    virtual Result<HealthStatus> get_health();
protected:
// Injected services
MessageClient* messages();
Logger* logger();
Config* config();
// Lifecycle hooks
virtual void on_initialized() {}
virtual void on_starting() {}
virtual void on_stopping() {}
};
// Helper macro for module registration
#define NEXUS_MODULE(id, name, version) \
static nexus::ModuleRegistrar \
_nexus_registrar(id, name, version);
} // namespace nexus
% MATLAB Module Base Class
classdef Module < handle
% Base class for all Nexus modules
properties (Access = protected)
messages % Message client
logger % Logger instance
config % Configuration
end
properties (Abstract, Constant)
Name % Module name
Description % Module description
Version % Module version
end
methods
function obj = Module()
% Constructor - services injected by runtime
end
function initialize(obj)
% Called when module is initialized
end
function start(obj)
% Called when module starts
end
function stop(obj)
% Called when module stops
end
function health = getHealth(obj)
% Return module health status
health = struct('status', 'healthy');
end
end
methods (Access = protected)
function subscribe(obj, topic, callback)
% Subscribe to message topic
obj.messages.subscribe(topic, callback);
end
function publish(obj, topic, payload)
% Publish message to topic
obj.messages.publish(topic, payload);
end
end
end
// LabVIEW Module API
// Module VIs are organized in a library (.lvlib)
// Required VIs for Module Implementation:
// =====================================
// 1. Initialize.vi
// Inputs:
// - Module Context (cluster)
// Outputs:
// - Error Out
// Description: Called when module loads
// 2. Start.vi
// Inputs:
// - Module Reference
// Outputs:
// - Error Out
// Description: Starts module execution
// 3. Stop.vi
// Inputs:
// - Module Reference
// Outputs:
// - Error Out
// Description: Stops module execution
// 4. Get Health.vi
// Inputs:
// - Module Reference
// Outputs:
// - Health Status (cluster)
// - Error Out
// Helper VIs provided by SDK:
// ==========================
// - Nexus.Subscribe.vi
// Subscribe to message topics
// - Nexus.Publish.vi
// Publish messages
// - Nexus.Logger.vi
// Log messages
// - Nexus.Config.Get.vi
// Read configuration values
// Module Metadata (in library properties):
// Name: "My Module"
// Version: "1.0.0"
// Description: "Module description"
ModuleBase Class
IMPORTANT: Always inherit from ModuleBase instead of implementing IModule directly. ModuleBase provides all the infrastructure you need:
// ModuleBase provides these properties automatically:
public abstract class ModuleBase : IModule
{
// Public properties - automatically available
public ILogger Logger { get; } // ✅ Auto-created from context
public IMessageBus MessageBus { get; } // ✅ Provided via context
public Guid ModuleId { get; } // Module's unique ID
public string Name { get; } // Module name
public string Version { get; } // Module version
// Protected properties
protected ModuleInfo ModuleInfo { get; } // ✅ Contains capabilities from manifest
protected IModuleContext Context { get; } // Full context access
// NEW: Capability checking methods
protected bool IsCapabilityEnabled(string capability); // Check if capability is enabled (from attributes OR manifest)
    protected IReadOnlyCollection<string> EnabledCapabilities { get; } // All enabled capabilities (union of attributes + manifest)
    protected IReadOnlyDictionary<string, ModuleCapabilityAttribute> DeclaredCapabilities { get; } // Capabilities declared via attributes only
// Override these for your logic
protected virtual Task OnInitializeAsync(); // Your initialization
protected virtual Task OnShutdownAsync(); // Your cleanup
}
// Example usage with hybrid capabilities:
[Module("my-module", "1.0.0")]
[ModuleCapability("DATA_PROCESSING", Required = true)]
[ModuleCapability("ADVANCED_ANALYTICS", Required = false)]
public class MyModule : ModuleBase
{
protected override async Task OnInitializeAsync()
{
// Everything is provided - no need to create logger or message bus!
Logger.LogInformation("Module initialized");
// Check enabled capabilities
if (IsCapabilityEnabled("ADVANCED_ANALYTICS"))
{
Logger.LogInformation("Advanced analytics enabled");
// Initialize advanced features
}
// Log all active capabilities
Logger.LogInformation("Active capabilities: {Caps}", string.Join(", ", EnabledCapabilities));
}
}
Message Client API
public interface IMessageClient
{
// Send a message to a topic
Task SendAsync(string topic, object message,
CancellationToken cancellationToken = default);
    // Subscribe to messages matching a pattern
    Task<ISubscription> SubscribeAsync(string topicPattern,
        MessageHandler handler);
    // Send request and wait for response
    Task<Message> RequestAsync(string topic,
        object request,
        TimeSpan? timeout = null);
    // Register a request handler
    Task<ISubscription> RegisterHandlerAsync(string topic,
        RequestHandler handler);
}
class MessageClient:
"""Client for message-based communication"""
async def send(self, topic: str, message: Any) -> None:
"""Send a message to a topic"""
pass
def subscribe(self, pattern: str, handler: Callable) -> Subscription:
"""Subscribe to messages matching a pattern"""
pass
async def request(self, topic: str, request: Any,
timeout: float = 30.0) -> Any:
"""Send request and wait for response"""
pass
def register_handler(self, topic: str,
handler: Callable) -> Subscription:
"""Register a request handler"""
pass
# Usage in module:
class MyModule(Module):
async def handle_data(self, message: Message):
data = message.payload
# Process data
await self.publish('processed.data', result)
class MessageClient {
public:
    // Send a message to a topic
    template <typename T>
    Result<void> send(const std::string& topic, const T& message);
    // Subscribe to messages matching a pattern
    Result<Subscription> subscribe(
        const std::string& pattern,
        std::function<void(const Message&)> handler);
    // Send request and wait for response
    template <typename TResp, typename TReq>
    Result<TResp> request(
        const std::string& topic,
        const TReq& request,
        std::chrono::milliseconds timeout = 30s);
    // Register a request handler
    Result<Subscription> register_handler(
        const std::string& topic,
        std::function<Result<Message>(const Message&)> handler);
};
// Usage in module:
void MyModule::on_initialized() {
messages()->subscribe("data.*",
[this](const Message& msg) {
handle_data(msg);
});
}
classdef MessageClient < handle
% Client for message-based communication
methods
function send(obj, topic, message)
% Send a message to a topic
% topic: string - Topic name
% message: struct/array - Message payload
end
function sub = subscribe(obj, pattern, callback)
% Subscribe to messages matching pattern
% pattern: string - Topic pattern (e.g., 'sensors.*')
% callback: function handle - @(message) handler
% Returns: Subscription object
end
function response = request(obj, topic, request, timeout)
% Send request and wait for response
% timeout: seconds (default: 30)
if nargin < 4
timeout = 30;
end
end
function sub = registerHandler(obj, topic, handler)
% Register a request handler
% handler: function handle - @(request) response
end
end
end
% Usage in module:
methods
function handleData(obj, message)
data = message.payload;
% Process data
result = processData(data);
obj.publish('processed.data', result);
end
end
// LabVIEW Message Client VIs
// Nexus.Send.vi
// Inputs:
// - Topic (string)
// - Message (variant/cluster)
// - Error In
// Outputs:
// - Error Out
// Description: Send message to topic
// Nexus.Subscribe.vi
// Inputs:
// - Pattern (string) - e.g., "sensors.*"
// - Message Handler VI Reference
// - Error In
// Outputs:
// - Subscription Reference
// - Error Out
// Description: Subscribe to topic pattern
// Nexus.Request.vi
// Inputs:
// - Topic (string)
// - Request (variant/cluster)
// - Timeout ms (U32, default: 30000)
// - Error In
// Outputs:
// - Response (variant)
// - Error Out
// Description: Send request and wait for response
// Nexus.RegisterHandler.vi
// Inputs:
// - Topic (string)
// - Request Handler VI Reference
// - Error In
// Outputs:
// - Subscription Reference
// - Error Out
// Example Message Handler VI:
// MessageHandler.vi
// Inputs:
// - Message (cluster: topic, payload, timestamp)
// Outputs:
// - Handled (boolean)
// - Error Out
Message Types
// Message received from the bus
public sealed class Message
{
public string Topic { get; }
public string PayloadJson { get; }
public DateTime Timestamp { get; }
// Deserialize payload to specific type
    public T? GetPayload<T>();
}
// Subscription handle
public interface ISubscription : IDisposable
{
string Id { get; }
}
// Health status
public sealed class HealthStatus
{
public bool IsHealthy { get; }
public string Message { get; }
public static HealthStatus Healthy(string message = "OK");
public static HealthStatus Unhealthy(string message);
}
# Message class
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Type, TypeVar

T = TypeVar('T')

@dataclass
class Message:
"""Message received from the bus"""
topic: str
payload: Any
timestamp: datetime
def get_typed(self, type_class: Type[T]) -> T:
"""Get payload as specific type"""
pass
# Subscription handle
class Subscription:
"""Handle to an active subscription"""
def __init__(self, id: str):
self.id = id
def unsubscribe(self) -> None:
"""Unsubscribe from topic"""
pass
def __enter__(self):
return self
def __exit__(self, *args):
self.unsubscribe()
# Health status
@dataclass
class HealthStatus:
"""Module health status"""
is_healthy: bool
message: str = "OK"
@staticmethod
def healthy(message: str = "OK") -> 'HealthStatus':
return HealthStatus(True, message)
@staticmethod
def unhealthy(message: str) -> 'HealthStatus':
return HealthStatus(False, message)
// Message structure
struct Message {
std::string topic;
std::string payload_json;
std::chrono::system_clock::time_point timestamp;
// Get payload as specific type
    template <typename T>
    std::optional<T> get_payload() const;
};
// Subscription handle
class Subscription {
public:
const std::string& id() const;
void unsubscribe();
// RAII - automatic cleanup
~Subscription() { unsubscribe(); }
};
// Health status
struct HealthStatus {
bool is_healthy;
std::string message;
static HealthStatus healthy(const std::string& msg = "OK") {
return {true, msg};
}
static HealthStatus unhealthy(const std::string& msg) {
return {false, msg};
}
};
// Result type for error handling
template <typename T>
class Result {
public:
bool is_ok() const;
bool is_error() const;
T& value();
const Error& error() const;
};
% Message structure
% message = struct with fields:
% topic: string - Message topic
% payload: any - Message data
% timestamp: datetime - Message timestamp
% Example message:
message = struct(...
'topic', 'sensors.temperature', ...
'payload', struct('value', 23.5, 'unit', 'C'), ...
'timestamp', datetime('now') ...
);
% Subscription class
classdef Subscription < handle
properties (SetAccess = private)
Id string
end
methods
function unsubscribe(obj)
% Unsubscribe from topic
end
function delete(obj)
% Auto cleanup
obj.unsubscribe();
end
end
end
% Health status structure
% Create healthy status:
healthStatus = struct(...
'isHealthy', true, ...
'message', 'OK' ...
);
% Create unhealthy status:
unhealthyStatus = struct(...
'isHealthy', false, ...
'message', 'Connection lost' ...
);
// LabVIEW Type Definitions
// Message.ctl (Cluster)
// Fields:
// - Topic (String)
// - Payload (Variant)
// - Timestamp (Timestamp)
// Subscription.ctl (Cluster)
// Fields:
// - ID (String)
// - Reference (U32)
// HealthStatus.ctl (Cluster)
// Fields:
// - IsHealthy (Boolean)
// - Message (String)
// Type Usage Examples:
// 1. Creating a Message:
// Use "Bundle By Name" to create message cluster
// Wire topic string, payload variant, and timestamp
// 2. Extracting Message Data:
// Use "Unbundle By Name" to access fields
// Use "Variant To Data" for typed payload
// 3. Health Status Creation:
// Healthy: Bundle TRUE with "OK" message
// Unhealthy: Bundle FALSE with error message
// Common Patterns:
// - Use Type Definitions (.ctl) for consistency
// - Store in project library for reuse
// - Use Variant attributes for metadata
Module Metadata
// Module attribute for metadata
[AttributeUsage(AttributeTargets.Class)]
public sealed class ModuleAttribute : Attribute
{
public string Id { get; }
public string Name { get; }
public string Version { get; }
public ModuleAttribute(string id, string name, string version);
}
// Usage:
[Module("my-module", "My Module", "1.0.0")]
public class MyModule : ModuleBase
{
// Module implementation
}
# Module decorator for metadata
def module(id: str, name: str, version: str):
"""Decorator to register module metadata"""
def decorator(cls):
cls._module_id = id
cls._module_name = name
cls._module_version = version
return cls
return decorator
# Usage:
@module("my-module", "My Module", "1.0.0")
class MyModule(Module):
"""Module implementation"""
pass
# Alternative: Class attributes
class MyModule(Module):
MODULE_ID = "my-module"
MODULE_NAME = "My Module"
MODULE_VERSION = "1.0.0"
// Module registration macro
#define NEXUS_MODULE(id, name, version) \
static const char* module_id() { return id; } \
static const char* module_name() { return name; } \
static const char* module_version() { return version; } \
static nexus::ModuleRegistrar \
_registrar(id, name, version);
// Usage:
class MyModule : public nexus::ModuleBase {
public:
MyModule() : ModuleBase("my-module", "My Module", "1.0.0") {}
// Module implementation
};
// Register the module
NEXUS_MODULE("my-module", "My Module", "1.0.0")
// Alternative: In-class declaration
class MyModule : public nexus::ModuleBase {
NEXUS_MODULE_METADATA("my-module", "My Module", "1.0.0")
public:
// Module implementation
};
% Module metadata via class properties
classdef MyModule < nexus.Module
properties (Constant)
% Required metadata
Name = 'my-module'
Description = 'My Module'
Version = '1.0.0'
% Optional metadata
Author = 'Your Name'
Dependencies = {"signal-processing", "data-logger"}
Capabilities = ["messages.publish", "messages.subscribe"]
end
methods
% Module implementation
end
end
% Alternative: Module info function
function info = getModuleInfo()
info = struct(...
'Name', 'my-module', ...
'Description', 'My Module', ...
'Version', '1.0.0', ...
'MainClass', 'MyModule' ...
);
end
// LabVIEW Module Metadata
// Option 1: Library Properties
// Right-click library (.lvlib) > Properties
// General Tab:
// - Name: "My Module"
// - Version: "1.0.0"
// Documentation Tab:
// - Description: "Module description"
// Option 2: Module Info VI
// Create GetModuleInfo.vi that returns cluster:
// Outputs:
// - Module Info (Cluster)
// - ID (String): "my-module"
// - Name (String): "My Module"
// - Version (String): "1.0.0"
// - Description (String)
// - Capabilities (String Array)
// Option 3: Module Configuration File
// MyModule.ini:
[Module]
ID=my-module
Name=My Module
Version=1.0.0
Description=Module description
MainVI=MyModule.vi
// Best Practice:
// Store metadata in library private data
// Access via property nodes in VIs
State Store
The NEXUS-1 SDK provides a built-in state persistence mechanism that allows modules to save and restore their state across restarts, upgrades, and system reboots. The State Store ensures data durability and consistency while handling the complexities of distributed storage.
Overview
Why Use State Store?
- Automatic Persistence: State survives module restarts and system reboots
- Type Safety: Generic methods ensure type-safe serialization
- Isolation: Each module's state is isolated from others
- Versioning: Handles state migration across module versions
- Performance: Optimized for both small and large state objects
API Reference
Accessing State Store
The State Store is available through the module context after initialization.
public class MyModule : ModuleBase
{
private IStateStore _stateStore;
protected override async Task OnInitializeAsync()
{
// Access state store from context
_stateStore = Context.StateStore;
// Check if persistence is enabled
if (_stateStore.IsEnabled)
{
Logger.LogInformation("State persistence is enabled");
}
}
}
class MyModule(ModuleBase):
async def on_initialize(self):
# Access state store from context
self._state_store = self.context.state_store
# Check if persistence is enabled
if self._state_store.is_enabled:
self.logger.info("State persistence is enabled")
class MyModule : public ModuleBase {
protected:
void OnInitialize() override {
// Access state store from context
auto stateStore = GetContext()->GetStateStore();
// Check if persistence is enabled
if (stateStore->IsEnabled()) {
GetLogger()->Info("State persistence is enabled");
}
}
};
classdef MyModule < nexus.Module
methods (Access = protected)
function onInitialize(obj)
% Access state store from context
stateStore = obj.Context.StateStore;
% Check if persistence is enabled
if stateStore.IsEnabled
obj.Logger.info('State persistence is enabled');
end
end
end
end
Saving State
Save module state with automatic serialization and type safety.
// Save simple values
await _stateStore.SaveStateAsync(Id, "lastProcessedId", lastId);
await _stateStore.SaveStateAsync(Id, "configuration", config);
// Save complex objects
var state = new ProcessingState
{
ProcessedCount = 1000,
LastProcessedTime = DateTime.UtcNow,
ErrorCount = 0,
Metrics = new Dictionary<string, double>
{
["avgProcessingTime"] = 45.2,
["throughput"] = 100.5
}
};
await _stateStore.SaveStateAsync(Id, "processingState", state);
// Save collections
var cache = new List<CacheItem> { /* items */ }; // CacheItem is an illustrative type
await _stateStore.SaveStateAsync(Id, "cache", cache);
# Save simple values
await self._state_store.save_state(self.id, "last_processed_id", last_id)
await self._state_store.save_state(self.id, "configuration", config)
# Save complex objects
state = ProcessingState(
processed_count=1000,
last_processed_time=datetime.utcnow(),
error_count=0,
metrics={
"avg_processing_time": 45.2,
"throughput": 100.5
}
)
await self._state_store.save_state(self.id, "processing_state", state)
# Save collections
cache = [item1, item2, item3]
await self._state_store.save_state(self.id, "cache", cache)
// Save simple values
co_await stateStore->SaveStateAsync(GetId(), "lastProcessedId", lastId);
co_await stateStore->SaveStateAsync(GetId(), "configuration", config);
// Save complex objects
ProcessingState state;
state.processedCount = 1000;
state.lastProcessedTime = std::chrono::system_clock::now();
state.errorCount = 0;
state.metrics["avgProcessingTime"] = 45.2;
state.metrics["throughput"] = 100.5;
co_await stateStore->SaveStateAsync(GetId(), "processingState", state);
// Save collections
std::vector<CacheItem> cache = { /* items */ }; // CacheItem is an illustrative type
co_await stateStore->SaveStateAsync(GetId(), "cache", cache);
% Save simple values
stateStore.saveState(obj.Id, 'lastProcessedId', lastId);
stateStore.saveState(obj.Id, 'configuration', config);
% Save complex objects
state = struct(...
'processedCount', 1000, ...
'lastProcessedTime', datetime('now', 'TimeZone', 'UTC'), ...
'errorCount', 0, ...
'metrics', containers.Map(...
{'avgProcessingTime', 'throughput'}, ...
{45.2, 100.5}));
stateStore.saveState(obj.Id, 'processingState', state);
% Save collections
cache = {item1, item2, item3};
stateStore.saveState(obj.Id, 'cache', cache);
Loading State
Load previously saved state with automatic deserialization.
// Load simple values
var lastId = await _stateStore.LoadStateAsync<string>(Id, "lastProcessedId");
if (lastId != null)
{
Logger.LogInformation($"Resuming from ID: {lastId}");
}
// Load complex objects
var state = await _stateStore.LoadStateAsync<ProcessingState>(Id, "processingState");
if (state != null)
{
Logger.LogInformation($"Restored state: {state.ProcessedCount} items processed");
// Resume from saved state
_processedCount = state.ProcessedCount;
_lastProcessedTime = state.LastProcessedTime;
}
// Load with default value
var config = await _stateStore.LoadStateAsync<Configuration>(Id, "configuration")
?? new Configuration(); // Default if not found
// Load collections
var cache = await _stateStore.LoadStateAsync<List<CacheItem>>(Id, "cache")
?? new List<CacheItem>();
Deleting State
Remove state entries when they are no longer needed.
// Delete specific state key
await _stateStore.DeleteStateAsync(Id, "temporaryCache");
// Delete multiple keys
foreach (var key in obsoleteKeys)
{
await _stateStore.DeleteStateAsync(Id, key);
}
// Clean up on module stop
protected override async Task OnStop()
{
if (_clearStateOnStop)
{
await _stateStore.DeleteStateAsync(Id, "processingState");
await _stateStore.DeleteStateAsync(Id, "cache");
}
}
Common Patterns
Checkpoint Pattern
Save processing checkpoints to resume after interruptions.
public class DataProcessor : ModuleBase
{
private class ProcessingCheckpoint
{
public string LastProcessedId { get; set; }
public DateTime CheckpointTime { get; set; }
public long ProcessedCount { get; set; }
public Dictionary<string, object> Metadata { get; set; }
}
private async Task ProcessDataBatch(List<DataItem> items)
{
var checkpoint = await LoadCheckpoint();
var startIndex = 0;
// Resume from checkpoint if available
if (checkpoint != null)
{
startIndex = items.FindIndex(i => i.Id == checkpoint.LastProcessedId) + 1;
Logger.LogInformation($"Resuming from checkpoint: {checkpoint.LastProcessedId}");
}
for (int i = startIndex; i < items.Count; i++)
{
await ProcessItem(items[i]);
// Save checkpoint every 100 items
if ((i + 1) % 100 == 0)
{
await SaveCheckpoint(new ProcessingCheckpoint
{
LastProcessedId = items[i].Id,
CheckpointTime = DateTime.UtcNow,
ProcessedCount = (checkpoint?.ProcessedCount ?? 0) + i + 1,
Metadata = new Dictionary<string, object>
{
["batchId"] = _currentBatchId,
["progress"] = (double)(i + 1) / items.Count
}
});
}
}
}
private async Task<ProcessingCheckpoint?> LoadCheckpoint()
{
return await Context.StateStore.LoadStateAsync<ProcessingCheckpoint>(
Id, "processing_checkpoint");
}
private async Task SaveCheckpoint(ProcessingCheckpoint checkpoint)
{
await Context.StateStore.SaveStateAsync(
Id, "processing_checkpoint", checkpoint);
}
}
Configuration Caching Pattern
Cache configuration with automatic reload on changes.
class ConfigurableModule(ModuleBase):
def __init__(self):
super().__init__()
self._config_cache = None
self._config_version = None
async def get_configuration(self):
# Check if configuration changed
current_version = self.context.configuration.get("version", "1.0")
if self._config_version != current_version:
# Load from state store first
cached = await self.context.state_store.load_state(
self.id, "config_cache", dict
)
if cached and cached.get("version") == current_version:
self._config_cache = cached["config"]
self._config_version = current_version
self.logger.info("Loaded configuration from cache")
else:
# Build configuration and cache it
self._config_cache = await self._build_configuration()
await self.context.state_store.save_state(
self.id, "config_cache", {
"version": current_version,
"config": self._config_cache,
"cached_at": datetime.utcnow().isoformat()
}
)
self._config_version = current_version
self.logger.info("Built and cached new configuration")
return self._config_cache
async def _build_configuration(self):
# Expensive configuration building logic
config = {}
# ... build complex configuration
return config
State Migration Pattern
Handle state schema changes across module versions.
public class EvolvingModule : ModuleBase
{
private const int CURRENT_STATE_VERSION = 3;
private class ModuleStateV3
{
public int Version { get; set; } = CURRENT_STATE_VERSION;
public string Id { get; set; }
public DateTime LastUpdated { get; set; }
public Dictionary<string, double> Metrics { get; set; }
public List<string> ProcessedItems { get; set; } // Added in V3
}
protected override async Task OnInitializeAsync()
{
await MigrateStateIfNeeded();
}
private async Task MigrateStateIfNeeded()
{
// Try to load current version
var state = await Context.StateStore.LoadStateAsync<ModuleStateV3>(Id, "module_state");
if (state == null)
{
// Try to load older versions
var oldState = await Context.StateStore.LoadStateAsync<dynamic>(Id, "module_state");
if (oldState != null)
{
state = MigrateState(oldState);
await Context.StateStore.SaveStateAsync(Id, "module_state", state);
Logger.LogInformation($"Migrated state from version {oldState.Version} to {CURRENT_STATE_VERSION}");
}
}
}
private ModuleStateV3 MigrateState(dynamic oldState)
{
var newState = new ModuleStateV3
{
Id = oldState.Id,
LastUpdated = oldState.LastUpdated ?? DateTime.UtcNow,
Metrics = new Dictionary<string, double>()
};
// Migrate based on version
switch ((int)oldState.Version)
{
case 1:
// V1 had metrics as a list
if (oldState.MetricsList != null)
{
foreach (var metric in oldState.MetricsList)
{
newState.Metrics[metric.Name] = metric.Value;
}
}
break;
case 2:
// V2 had metrics as dictionary
newState.Metrics = oldState.Metrics;
break;
}
// Initialize new fields
newState.ProcessedItems = new List<string>();
return newState;
}
}
Best Practices
State Store Guidelines
- Key Naming: Use descriptive, hierarchical keys (e.g., "cache.user.profile")
- State Size: Keep individual state objects reasonably sized (< 1MB recommended)
- Versioning: Include version numbers in state objects for migration
- Cleanup: Delete temporary state when no longer needed
- Error Handling: Always handle null returns from LoadStateAsync
- Atomicity: State operations are atomic per key, not across keys
- Performance: Cache frequently accessed state in memory
- Security: Don't store sensitive data without encryption
- Testing: Test with both enabled and disabled state store
- Recovery: Design for graceful handling of missing state
⚠️ Important Considerations
- State Store may be disabled in some deployments - always check IsEnabled
- State is scoped to module ID - state is lost if module ID changes
- Large objects may impact performance - consider chunking
- State is not shared between modules - use message bus for sharing
- Backup important state externally for disaster recovery
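Pulling these guidelines together, a defensive load might look like the following minimal sketch; the ProcessingState type and its Version field are illustrative assumptions, not SDK-defined:
// Sketch: check IsEnabled, handle missing state, verify version before trusting data
private async Task<ProcessingState> LoadStateSafelyAsync()
{
    if (!_stateStore.IsEnabled)
        return new ProcessingState(); // Persistence disabled: start fresh

    var state = await _stateStore.LoadStateAsync<ProcessingState>(Id, "processingState");
    if (state == null || state.Version != CURRENT_STATE_VERSION)
        return new ProcessingState(); // Missing or incompatible: start fresh

    return state;
}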
Capability-Based Routing & Quality of Service
The Nexus SDK provides powerful capability-based routing that enables dynamic service discovery, quality of service (QoS) tiers, and resilient system design. Instead of hardcoding module names, you can request services by their capabilities.
Why Use Capability-Based Routing?
- Dynamic Service Discovery - Find modules by what they can do, not by their names
- Quality of Service (QoS) - Request high-priority modules for critical operations
- Graceful Degradation - Automatically fall back to lower-priority alternatives
- Cost Optimization - Use expensive resources only when necessary
- Zero Configuration - No need to update code when modules change
Understanding Priority-Based Routing
How It Works
When multiple modules provide the same capability, they each have a priority (0-1000):
- 0-299: Low priority - Backup/fallback modules
- 300-699: Standard priority - Normal operations
- 700-1000: High priority - Premium/critical services
When you request a service, you specify the minimum acceptable priority. The host:
- Finds all healthy modules with the required capability
- Filters modules that meet your minimum priority requirement
- Routes to the lowest-priority module that still meets your requirement
- Handles failover to the next available module if needed
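Conceptually, the selection step resembles the LINQ query below. This is an illustration of the rules above, assuming a providers collection and a minimumPriority value; it is not the host's actual implementation:
// Conceptual sketch of the routing rules above (not the host's real code)
var target = providers
    .Where(p => p.IsHealthy)                   // only healthy modules
    .Where(p => p.Priority >= minimumPriority) // meet the caller's floor
    .OrderBy(p => p.Priority)                  // cheapest tier that still qualifies
    .FirstOrDefault();                         // failover moves to the next entry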
Core APIs for Capability Routing
1. Request by Capability
Send requests to modules based on their capabilities:
// Basic capability request (accepts any priority)
var response = await MessageBus.RequestByCapabilityAsync<ReadSensorResponse>(
capability: "READ_SENSOR",
request: new ReadSensorRequest { SensorId = "temp-001" }
);
// Request with minimum priority for critical operations
var criticalResponse = await MessageBus.RequestByCapabilityAsync<ReadSensorResponse>(
capability: "READ_SENSOR",
request: new ReadSensorRequest { SensorId = "reactor-temp" },
minimumPriority: 800, // Require high-quality sensor
timeout: TimeSpan.FromSeconds(5)
);
// Handle the response
if (criticalResponse.Success)
{
Logger.LogInformation("Temperature: {Temp}°C from high-priority sensor",
criticalResponse.Temperature);
}
else
{
Logger.LogError("Failed to read critical sensor: {Error}",
criticalResponse.ErrorMessage);
}
2. Check Capability Availability
Verify if a capability is available before making requests:
// Check if any module provides a capability
if (await MessageBus.IsCapabilityAvailableAsync("WRITE_PLC_DATA"))
{
// Safe to make PLC write requests
}
// Check if high-priority service is available
if (await MessageBus.IsCapabilityAvailableAsync("EMERGENCY_SHUTDOWN", minimumPriority: 900))
{
Logger.LogInformation("Emergency shutdown system is available");
}
else
{
Logger.LogWarning("No high-priority emergency shutdown available!");
}
// Implement graceful degradation
var requiredPriority = 800;
while (requiredPriority >= 0)
{
if (await MessageBus.IsCapabilityAvailableAsync("DATA_PROCESSING", requiredPriority))
{
Logger.LogInformation("Data processing available at priority {Priority}", requiredPriority);
break;
}
requiredPriority -= 200; // Try lower priority tiers
}
3. Discover Capability Providers
Get detailed information about modules providing a capability:
// Get all providers of a capability
var allProviders = await MessageBus.GetCapabilityProvidersAsync("READ_SENSOR");
foreach (var provider in allProviders.OrderByDescending(p => p.Priority))
{
Logger.LogInformation("Provider: {Name} | Priority: {Priority} | Healthy: {Healthy}",
provider.ModuleName, provider.Priority, provider.IsHealthy);
}
// Get only high-priority providers for audit
var criticalProviders = await MessageBus.GetCapabilityProvidersAsync(
"AUDIT_LOGGING",
minimumPriority: 700
);
if (!criticalProviders.Any(p => p.IsHealthy))
{
Logger.LogCritical("No healthy high-priority audit providers available!");
}
// Build a provider selection strategy
var providers = await MessageBus.GetCapabilityProvidersAsync("DATA_STORAGE");
var selectedProvider = providers
.Where(p => p.IsHealthy)
.Where(p => p.Priority >= 500) // Minimum acceptable quality
.OrderBy(p => p.Priority) // Prefer lower priority (cheaper)
.FirstOrDefault();
if (selectedProvider != null)
{
Logger.LogInformation("Selected {Provider} at priority {Priority} for cost optimization",
selectedProvider.ModuleName, selectedProvider.Priority);
}
Real-World Scenarios
Scenario 1: Multi-Tier Sensor System
Different sensor qualities for different use cases:
public class SensorService
{
private readonly IMessageBus _messageBus;
private readonly ILogger _logger;
public async Task<double?> ReadTemperatureAsync(string location, bool isCritical)
{
try
{
// Critical readings require high-precision sensors
var minimumPriority = isCritical ? 800 : 200;
var response = await _messageBus.RequestByCapabilityAsync<ReadTempResponse>(
capability: "READ_TEMPERATURE",
request: new ReadTempRequest { Location = location },
minimumPriority: minimumPriority,
timeout: TimeSpan.FromSeconds(isCritical ? 10 : 5)
);
if (response.Success)
{
_logger.LogInformation("Got temperature {Temp}°C from {Module} (Priority: {Priority})",
response.Temperature, response.ModuleName, response.ModulePriority);
return response.Temperature;
}
_logger.LogWarning("Temperature reading failed: {Error}", response.ErrorMessage);
return null;
}
catch (TimeoutException)
{
_logger.LogError("Temperature reading timed out for {Location}", location);
return null;
}
}
}
Scenario 2: Cost-Optimized Data Processing
Use expensive high-performance modules only when needed:
public class DataProcessor
{
private readonly IMessageBus _messageBus;
private readonly ILogger _logger;
public async Task ProcessDataAsync(DataSet data, ProcessingRequirements requirements)
{
// Determine required priority based on requirements
int minimumPriority = requirements switch
{
ProcessingRequirements.RealTime => 900, // GPU-accelerated modules
ProcessingRequirements.HighSpeed => 700, // Multi-core optimized
ProcessingRequirements.Standard => 400, // Standard processing
ProcessingRequirements.Background => 100, // Low-priority batch
_ => 0
};
// Check availability at required tier
if (!await _messageBus.IsCapabilityAvailableAsync("DATA_PROCESSING", minimumPriority))
{
// Fallback logic
if (requirements == ProcessingRequirements.RealTime)
{
throw new InvalidOperationException("Real-time processing not available");
}
// Try lower tier
minimumPriority = Math.Max(0, minimumPriority - 200);
}
// Process with appropriate module
var result = await _messageBus.RequestByCapabilityAsync<ProcessResponse>(
capability: "DATA_PROCESSING",
request: new ProcessRequest { Data = data },
minimumPriority: minimumPriority
);
Logger.LogInformation("Processed {Count} items using {Module} (cost tier: {Priority})",
data.Count, result.ProcessedBy, minimumPriority);
}
}
Scenario 3: Health Monitoring Dashboard
Monitor capability availability across the system:
public class SystemHealthMonitor
{
private readonly IMessageBus _messageBus;
private readonly string[] _criticalCapabilities =
{
"EMERGENCY_SHUTDOWN",
"AUDIT_LOGGING",
"SECURITY_MONITORING",
"DATA_BACKUP"
};
public async Task<SystemHealthReport> CheckSystemHealthAsync()
{
var report = new SystemHealthReport();
foreach (var capability in _criticalCapabilities)
{
var providers = await _messageBus.GetCapabilityProvidersAsync(capability);
var status = new CapabilityStatus
{
Capability = capability,
TotalProviders = providers.Length,
HealthyProviders = providers.Count(p => p.IsHealthy),
HighPriorityProviders = providers.Count(p => p.Priority >= 700),
HighestPriority = providers.Length > 0 ? providers.Max(p => p.Priority) : 0
};
// Check critical thresholds
if (status.HealthyProviders == 0)
{
report.CriticalIssues.Add($"No healthy providers for {capability}");
}
else if (capability == "EMERGENCY_SHUTDOWN" && status.HighPriorityProviders == 0)
{
report.Warnings.Add($"No high-priority providers for {capability}");
}
report.CapabilityStatuses.Add(status);
}
return report;
}
}
Best Practices
Priority Guidelines
Priority Range | Use Case | Examples |
---|---|---|
900-1000 | Mission-critical, real-time | Emergency shutdown, safety systems, medical devices |
700-899 | High-priority production | Primary sensors, control systems, audit logging |
400-699 | Standard operations | Normal processing, standard sensors, regular storage |
200-399 | Non-critical tasks | Reporting, analytics, background processing |
0-199 | Fallback/development | Test modules, simulators, backup systems |
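One way to keep these tiers consistent in code is a set of named constants. The PriorityTier class below is a local convention for this sketch, not an SDK-defined symbol:
// Illustrative constants mirroring the table above (not SDK-defined)
public static class PriorityTier
{
    public const int MissionCritical = 900;
    public const int HighPriority = 700;
    public const int Standard = 400;
    public const int NonCritical = 200;
    public const int Fallback = 0;
}
// Usage: request at a named tier instead of a magic number
var response = await MessageBus.RequestByCapabilityAsync<ReadSensorResponse>(
    capability: "READ_SENSOR",
    request: new ReadSensorRequest { SensorId = "temp-001" },
    minimumPriority: PriorityTier.HighPriority);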
Error Handling
try
{
var response = await MessageBus.RequestByCapabilityAsync<OperationResponse>(
capability: "CRITICAL_OPERATION",
request: request,
minimumPriority: 800,
timeout: TimeSpan.FromSeconds(30)
);
if (!response.Success)
{
// Handle application-level failure
Logger.LogError("Operation failed: {Error}", response.ErrorMessage);
}
}
catch (InvalidOperationException ex)
{
// No modules available with required capability/priority
Logger.LogError(ex, "No suitable module found");
}
catch (TimeoutException ex)
{
// Request timed out
Logger.LogError(ex, "Operation timed out");
}
catch (Exception ex)
{
// Other errors (network, serialization, etc.)
Logger.LogError(ex, "Unexpected error in capability request");
}
Performance Considerations
- Cache capability lookups - Provider information changes infrequently (a minimal cache sketch follows this list)
- Use appropriate timeouts - Higher priority modules may have longer startup times
- Monitor module health - Unhealthy modules are automatically excluded from routing
- Plan for degradation - Always have a fallback strategy when high-priority modules aren't available
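A minimal time-bounded cache for provider lookups might look like the sketch below; the CapabilityProvider type name is assumed from the examples above:
// Sketch: cache provider lookups for a short TTL to avoid repeated queries
private readonly Dictionary<string, (CapabilityProvider[] Providers, DateTime FetchedAt)> _providerCache = new();
private static readonly TimeSpan ProviderCacheTtl = TimeSpan.FromSeconds(30);

private async Task<CapabilityProvider[]> GetProvidersCachedAsync(string capability)
{
    if (_providerCache.TryGetValue(capability, out var entry) &&
        DateTime.UtcNow - entry.FetchedAt < ProviderCacheTtl)
    {
        return entry.Providers; // Still fresh: reuse
    }
    var providers = await MessageBus.GetCapabilityProvidersAsync(capability);
    _providerCache[capability] = (providers, DateTime.UtcNow);
    return providers;
}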
Message Handling
Master the NEXUS-1 SDK's message handling capabilities to build robust inter-module communication. This section covers message structure, APIs, and best practices.
Message Structure
Core Message Components
Message Class Reference
// Message structure available to modules
public class Message
{
public string Topic { get; }
public object Payload { get; }
public Dictionary<string, string> Headers { get; }
public MessageMetadata Metadata { get; }
// Helper methods
public T GetPayload<T>();
public string GetHeader(string key, string defaultValue = null);
public bool HasHeader(string key);
}
public class MessageMetadata
{
public string MessageId { get; }
public string SenderId { get; }
public DateTime Timestamp { get; }
public string CorrelationId { get; }
public int Version { get; }
}
Publishing Messages
public class SensorModule : ModuleBase
{
// Simple publish
public async Task PublishReading(double value)
{
await Messages.PublishAsync("sensors.temperature", new
{
Value = value,
Unit = "celsius",
Timestamp = DateTime.UtcNow
});
}
// Publish with headers
public async Task PublishWithMetadata(SensorData data)
{
var headers = new Dictionary<string, string>
{
["sensor-id"] = data.SensorId,
["location"] = data.Location,
["priority"] = "high",
["retention"] = "7d"
};
await Messages.PublishAsync("sensors.data", data, headers);
}
// Publish with options
public async Task PublishHighFrequency(DataPoint data)
{
var options = new PublishOptions
{
Headers = new Dictionary<string, string> { ["type"] = "telemetry" },
Format = SerializationFormat.MessagePack,
Compression = CompressionType.LZ4,
Priority = MessagePriority.High,
Expiration = TimeSpan.FromMinutes(5)
};
await Messages.PublishAsync("telemetry.realtime", data, options);
}
// Fire and forget pattern
public void PublishNoWait(string status)
{
_ = Messages.PublishAsync("module.status", new { Status = status })
.ContinueWith(t =>
{
if (t.IsFaulted)
Logger.LogError("Publish failed: {Error}", t.Exception);
});
}
// Batch publishing
public async Task PublishBatch(IEnumerable<SensorReading> readings)
{
var publishTasks = readings.Select(reading =>
Messages.PublishAsync($"sensors.{reading.Type}", reading)
);
await Task.WhenAll(publishTasks);
}
}
class SensorModule(Module):
# Simple publish
async def publish_reading(self, value: float):
await self.messages.publish('sensors.temperature', {
'value': value,
'unit': 'celsius',
'timestamp': datetime.utcnow().isoformat()
})
# Publish with headers
async def publish_with_metadata(self, data: SensorData):
headers = {
'sensor-id': data.sensor_id,
'location': data.location,
'priority': 'high',
'retention': '7d'
}
await self.messages.publish('sensors.data', data, headers=headers)
# Publish with options
async def publish_high_frequency(self, data: DataPoint):
await self.messages.publish(
'telemetry.realtime',
data,
headers={'type': 'telemetry'},
format='messagepack',
compression='lz4',
priority='high',
expiration=300 # 5 minutes
)
# Fire and forget pattern
def publish_no_wait(self, status: str):
asyncio.create_task(
self._publish_with_error_handling('module.status', {'status': status})
)
async def _publish_with_error_handling(self, topic: str, payload: dict):
try:
await self.messages.publish(topic, payload)
except Exception as e:
self.logger.error(f"Publish failed: {e}")
# Batch publishing
async def publish_batch(self, readings: List[SensorReading]):
tasks = [
self.messages.publish(f'sensors.{reading.type}', reading.to_dict())
for reading in readings
]
await asyncio.gather(*tasks, return_exceptions=True)
# Publishing with transaction context
async def publish_transactional(self, data: dict):
async with self.messages.transaction() as tx:
await tx.publish('data.raw', data)
processed = self.process_data(data)
await tx.publish('data.processed', processed)
# All messages sent atomically on context exit
#include <nlohmann/json.hpp>
class SensorModule : public nexus::ModuleBase {
public:
// Simple publish
void publish_reading(double value) {
nlohmann::json payload = {
{"value", value},
{"unit", "celsius"},
{"timestamp", std::chrono::system_clock::now()}
};
messages()->publish("sensors.temperature", payload);
}
// Publish with headers
void publish_with_metadata(const SensorData& data) {
std::map<std::string, std::string> headers = {
{"sensor-id", data.sensor_id},
{"location", data.location},
{"priority", "high"},
{"retention", "7d"}
};
messages()->publish("sensors.data", data, headers);
}
// Publish with options
void publish_high_frequency(const DataPoint& data) {
nexus::PublishOptions options;
options.headers["type"] = "telemetry";
options.format = nexus::SerializationFormat::MessagePack;
options.compression = nexus::CompressionType::LZ4;
options.priority = nexus::MessagePriority::High;
options.expiration = std::chrono::minutes(5);
messages()->publish("telemetry.realtime", data, options);
}
// Async publish with callback
void publish_async(const std::string& status) {
messages()->publish_async("module.status",
{{"status", status}},
[this](const nexus::PublishResult& result) {
if (!result.success) {
logger()->error("Publish failed: {}", result.error);
}
});
}
// Batch publishing
void publish_batch(const std::vector<SensorReading>& readings) {
std::vector<std::future<void>> futures;
for (const auto& reading : readings) {
futures.push_back(
messages()->publish_async(
fmt::format("sensors.{}", reading.type),
reading
)
);
}
// Wait for all
for (auto& future : futures) {
try {
future.get();
} catch (const std::exception& e) {
logger()->error("Batch publish failed: {}", e.what());
}
}
}
// Publishing with custom serialization
void publish_binary(const BinaryData& data) {
// Convert to bytes
std::vector<uint8_t> bytes = data.serialize();
messages()->publish_raw("binary.data", bytes, {
{"content-type", "application/octet-stream"},
{"encoding", "custom-binary"}
});
}
};
classdef SensorModule < nexus.Module
methods
% Simple publish
function publishReading(obj, value)
payload = struct(...
'value', value, ...
'unit', 'celsius', ...
'timestamp', datetime('now', 'TimeZone', 'UTC') ...
);
obj.publish('sensors.temperature', payload);
end
% Publish with headers
function publishWithMetadata(obj, data)
headers = containers.Map();
headers('sensor-id') = data.sensorId;
headers('location') = data.location;
headers('priority') = 'high';
headers('retention') = '7d';
obj.publish('sensors.data', data, 'Headers', headers);
end
% Publish with options
function publishHighFrequency(obj, data)
options = struct();
options.Headers = containers.Map({'type'}, {'telemetry'});
options.Format = 'messagepack';
options.Compression = 'lz4';
options.Priority = 'high';
options.Expiration = seconds(300); % 5 minutes
obj.publish('telemetry.realtime', data, options);
end
% Fire and forget pattern
function publishNoWait(obj, status)
% Use parfeval for async publish
f = parfeval(@(o,s) o.publishAsync('module.status', struct('status', s)), ...
0, obj, status);
% Add error handler
afterEach(f, @(~) [], @(err) obj.logger.error('Publish failed: %s', err.message));
end
% Batch publishing
function publishBatch(obj, readings)
% Create futures for parallel publishing
futures = parallel.FevalFuture.empty(length(readings), 0);
for i = 1:length(readings)
reading = readings(i);
topic = sprintf('sensors.%s', reading.type);
futures(i) = parfeval(@obj.publish, 0, topic, reading);
end
% Wait for all
try
wait(futures);
catch ME
obj.logger.error('Batch publish failed: %s', ME.message);
end
end
% Publishing with validation
function publishValidated(obj, data)
% Validate data before publishing
try
obj.validateData(data);
% Add validation timestamp
data.validatedAt = datetime('now', 'TimeZone', 'UTC');
obj.publish('data.validated', data);
catch ME
obj.logger.error('Validation failed: %s', ME.message);
% Publish to error topic
errorPayload = struct(...
'originalData', data, ...
'error', ME.message, ...
'timestamp', datetime('now', 'TimeZone', 'UTC') ...
);
obj.publish('data.validation.errors', errorPayload);
end
end
% Publishing arrays efficiently
function publishArray(obj, sensorId, values)
% For large arrays, use binary format
if length(values) > 1000
% Convert to bytes
byteData = typecast(values, 'uint8');
headers = containers.Map();
headers('content-type') = 'application/matlab-array';
headers('array-type') = 'double';
headers('array-length') = num2str(length(values));
obj.publishBinary('arrays.large', byteData, 'Headers', headers);
else
% Small arrays as JSON
obj.publish('arrays.small', struct('sensorId', sensorId, 'values', values));
end
end
end
end
// LabVIEW Message Publishing Patterns
// 1. Simple Publish VI
// === PublishReading.vi ===
// Inputs:
// - Module Reference (DVR)
// - Value (DBL)
// Implementation:
// 1. Create Message Cluster:
// - value (DBL)
// - unit (String) = "celsius"
// - timestamp (Timestamp)
// 2. Convert to JSON (Nexus.JSON.Serialize)
// 3. Nexus.MessageBus.Publish
// - Topic: "sensors.temperature"
// - Payload: JSON String
// 2. Publish with Headers
// === PublishWithHeaders.vi ===
// Inputs:
// - Module Reference
// - Data (Cluster)
// - Headers (Variant - Map)
// Implementation:
// 1. Create Headers Map:
// - Insert "sensor-id" -> Data.SensorId
// - Insert "location" -> Data.Location
// - Insert "priority" -> "high"
// 2. Nexus.MessageBus.PublishWithHeaders
// - Topic: "sensors.data"
// - Payload: Data
// - Headers: Map
// 3. High-Performance Publishing
// === PublishHighFrequency.vi ===
// Best practices for speed:
// 1. Pre-allocate message buffers
// 2. Use MessagePack format
// 3. Batch small messages
// 4. Use Producer/Consumer pattern
// Implementation:
// [Producer Loop] --> Queue --> [Consumer Loop]
// |
// v
// [Batch & Publish]
// 4. Async Publish Pattern
// === PublishAsync.vi ===
// Using Start Asynchronous Call:
// 1. Create VI Reference to PublishWorker.vi
// 2. Start Asynchronous Call
// - VI Reference
// - Inputs: Topic, Payload
// - Options: Fire & Forget
// 3. Optional: Monitor with Async Call Status
// 5. Batch Publishing
// === PublishBatch.vi ===
// Inputs:
// - Readings Array (1D Array of Clusters)
// Implementation:
// For Loop with Parallelism Enabled:
// - Iteration Parallelism = # CPU Cores
// - Inside Loop:
// a. Format topic: "sensors.%s" with reading.type
// b. Nexus.MessageBus.Publish
// - Use Conditional Terminal for early exit on error
// 6. Message Builder Pattern
// === MessageBuilder.lvclass ===
// Methods:
// - SetTopic(String)
// - SetPayload(Variant)
// - AddHeader(Key, Value)
// - SetFormat(Enum)
// - SetCompression(Boolean)
// - Build() -> Message Reference
// - Publish() -> Error
// Usage:
// Builder.SetTopic("data.processed")
// .SetPayload(data)
// .AddHeader("version", "2.0")
// .SetFormat(MessagePack)
// .Publish()
// 7. Error Handling Pattern
// === SafePublish.vi ===
// Wrapper with retry logic:
// 1. Try-Catch structure (Error Case Structure)
// 2. On Error:
// a. Log error
// b. Retry with exponential backoff
// c. After N failures -> Dead Letter Queue
// 3. Success -> Clear error
// 8. Performance Tips:
// - Use Subroutine priority for high-freq publishing
// - Disable debugging on production VIs
// - Use In Place Element structures
// - Pre-compile message templates
// - Consider UDP for non-critical telemetry
// Example: Complete Publisher VI
// [Initialize] --> [Configure Headers Map]
// |
// v
// [While Loop - Running]
// |
// v
// [Dequeue Message Data]
// |
// v
// [Format Selection Case]
// / | \
// JSON MessagePack Binary
// \ | /
// v
// [Add Timestamp Header]
// |
// v
// [Nexus.MessageBus.Publish]
// |
// v
// [Error Handler]
// |
// v
// [Update Metrics]
Subscribing to Messages
public class ProcessorModule : ModuleBase
{
protected override async Task OnInitializeAsync()
{
// Simple subscription
await Messages.SubscribeAsync("sensors.temperature", HandleTemperature);
// Pattern subscription (wildcards)
await Messages.SubscribeAsync("sensors.*", HandleAnySensor);
await Messages.SubscribeAsync("alerts.**.critical", HandleCriticalAlerts);
// Subscription with filter
await Messages.SubscribeAsync("data.raw", HandleHighPriorityData,
filter: msg => msg.Headers.ContainsKey("priority") &&
msg.Headers["priority"] == "high");
// Typed subscription
await Messages.SubscribeAsync<TemperatureReading>(
"sensors.temperature.typed",
HandleTypedTemperature);
// Subscription with options
var options = new SubscriptionOptions
{
Filter = msg => msg.Metadata.SenderId != ModuleInfo.Id, // Ignore own messages
ErrorHandler = HandleSubscriptionError,
MaxConcurrency = 5, // Process max 5 messages concurrently
BufferSize = 1000, // Internal queue size
AcknowledgmentMode = AckMode.Manual
};
await Messages.SubscribeAsync("commands.*", HandleCommand, options);
}
private async Task HandleTemperature(Message message)
{
var data = message.GetPayload<dynamic>();
Logger.LogInformation("Temperature: {Value}{Unit} from {Sensor}",
data.value, data.unit, message.GetHeader("sensor-id"));
// Access metadata
Logger.LogDebug("Message ID: {Id}, Sent at: {Time}",
message.Metadata.MessageId,
message.Metadata.Timestamp);
}
private async Task HandleTypedTemperature(TemperatureReading reading, Message message)
{
// Strongly typed handler - payload already deserialized
if (reading.Value > 100)
{
await Messages.PublishAsync("alerts.temperature.high", new
{
Reading = reading,
Threshold = 100,
Timestamp = DateTime.UtcNow
});
}
}
private async Task HandleCommand(Message message)
{
try
{
// Process command
var command = message.GetPayload<Command>();
await ProcessCommand(command);
// Manual acknowledgment
await message.AcknowledgeAsync();
}
catch (Exception ex)
{
// Negative acknowledgment - message will be redelivered
await message.RejectAsync(requeue: true);
throw;
}
}
private void HandleSubscriptionError(Message message, Exception error)
{
Logger.LogError(error, "Error processing message on topic {Topic}", message.Topic);
// Optionally publish to dead letter queue
_ = Messages.PublishAsync($"dlq.{message.Topic}", new
{
OriginalMessage = message,
Error = error.ToString(),
FailedAt = DateTime.UtcNow
});
}
// Unsubscribe when needed
private ISubscription _subscription;
public async Task StartMonitoring()
{
_subscription = await Messages.SubscribeAsync("metrics.*", HandleMetrics);
}
public async Task StopMonitoring()
{
await _subscription.UnsubscribeAsync();
}
}
class ProcessorModule(Module):
async def on_initialize(self):
# Simple subscription
await self.messages.subscribe('sensors.temperature', self.handle_temperature)
# Pattern subscription (wildcards)
await self.messages.subscribe('sensors.*', self.handle_any_sensor)
await self.messages.subscribe('alerts.**.critical', self.handle_critical_alerts)
# Subscription with filter
await self.messages.subscribe(
'data.raw',
self.handle_high_priority_data,
filter=lambda msg: msg.headers.get('priority') == 'high'
)
# Typed subscription with validation
await self.messages.subscribe(
'sensors.temperature.typed',
self.handle_typed_temperature,
message_type=TemperatureReading
)
# Subscription with options
options = SubscriptionOptions(
filter=lambda msg: msg.metadata.sender_id != self.info.id, # Ignore own
error_handler=self.handle_subscription_error,
max_concurrency=5, # Process max 5 messages concurrently
buffer_size=1000, # Internal queue size
acknowledgment_mode='manual'
)
await self.messages.subscribe('commands.*', self.handle_command, options)
# Multiple handlers for same topic
await self.messages.subscribe('events.user', self.log_event)
await self.messages.subscribe('events.user', self.process_event)
await self.messages.subscribe('events.user', self.audit_event)
async def handle_temperature(self, message: Message):
data = message.get_payload()
self.logger.info(f"Temperature: {data['value']}{data['unit']} "
f"from {message.get_header('sensor-id')}")
# Access metadata
self.logger.debug(f"Message ID: {message.metadata.message_id}, "
f"Sent at: {message.metadata.timestamp}")
async def handle_typed_temperature(self, reading: TemperatureReading, message: Message):
# Strongly typed handler
if reading.value > 100:
await self.messages.publish('alerts.temperature.high', {
'reading': reading.to_dict(),
'threshold': 100,
'timestamp': datetime.utcnow().isoformat()
})
async def handle_command(self, message: Message):
try:
# Process command
command = Command.from_dict(message.get_payload())
await self.process_command(command)
# Manual acknowledgment
await message.acknowledge()
except Exception as e:
# Negative acknowledgment - message will be redelivered
await message.reject(requeue=True)
raise
def handle_subscription_error(self, message: Message, error: Exception):
self.logger.error(f"Error processing message on topic {message.topic}: {error}")
# Publish to dead letter queue
asyncio.create_task(
self.messages.publish(f'dlq.{message.topic}', {
'original_message': message.to_dict(),
'error': str(error),
'failed_at': datetime.utcnow().isoformat()
})
)
# Context manager for temporary subscriptions
@asynccontextmanager
async def temporary_subscription(self, topic: str, handler):
subscription = await self.messages.subscribe(topic, handler)
try:
yield subscription
finally:
await subscription.unsubscribe()
# Using temporary subscription
async def monitor_for_duration(self, duration: float):
async with self.temporary_subscription('metrics.*', self.handle_metrics) as sub:
await asyncio.sleep(duration)
self.logger.info(f"Received {sub.message_count} metrics")
class ProcessorModule : public nexus::ModuleBase {
protected:
void on_initialized() override {
// Simple subscription
messages()->subscribe("sensors.temperature",
[this](const nexus::Message& msg) { handle_temperature(msg); });
// Pattern subscription (wildcards)
messages()->subscribe("sensors.*",
[this](const nexus::Message& msg) { handle_any_sensor(msg); });
messages()->subscribe("alerts.**.critical",
[this](const nexus::Message& msg) { handle_critical_alerts(msg); });
// Subscription with filter
messages()->subscribe("data.raw",
[this](const nexus::Message& msg) { handle_high_priority_data(msg); },
nexus::SubscriptionOptions()
.with_filter([](const nexus::Message& msg) {
return msg.has_header("priority") &&
msg.get_header("priority") == "high";
})
);
// Typed subscription
messages()->subscribe("sensors.temperature.typed",
[this](const TemperatureReading& reading, const nexus::Message& msg) {
handle_typed_temperature(reading, msg);
});
// Subscription with advanced options
auto options = nexus::SubscriptionOptions()
.with_filter([this](const nexus::Message& msg) {
return msg.metadata().sender_id() != info().id; // Ignore own
})
.with_error_handler([this](const nexus::Message& msg, const std::exception& e) {
handle_subscription_error(msg, e);
})
.with_max_concurrency(5)
.with_buffer_size(1000)
.with_acknowledgment_mode(nexus::AckMode::Manual);
messages()->subscribe("commands.*",
[this](const nexus::Message& msg) { handle_command(msg); },
options);
}
private:
void handle_temperature(const nexus::Message& message) {
auto data = message.get_payload<nlohmann::json>();
logger()->info("Temperature: {}{} from {}",
data["value"], data["unit"], message.get_header("sensor-id"));
// Access metadata
logger()->debug("Message ID: {}, Sent at: {}",
message.metadata().message_id(),
message.metadata().timestamp());
}
void handle_typed_temperature(const TemperatureReading& reading,
const nexus::Message& message) {
if (reading.value > 100) {
messages()->publish("alerts.temperature.high", {
{"reading", reading},
{"threshold", 100},
{"timestamp", std::chrono::system_clock::now()}
});
}
}
void handle_command(const nexus::Message& message) {
try {
// Process command
auto command = message.get_payload<Command>();
process_command(command);
// Manual acknowledgment
message.acknowledge();
} catch (const std::exception& e) {
// Negative acknowledgment
message.reject(true); // requeue = true
throw;
}
}
void handle_subscription_error(const nexus::Message& message,
const std::exception& error) {
logger()->error("Error processing message on topic {}: {}",
message.topic(), error.what());
// Publish to dead letter queue
messages()->publish(fmt::format("dlq.{}", message.topic()), {
{"original_message", message},
{"error", error.what()},
{"failed_at", std::chrono::system_clock::now()}
});
}
// Subscription management
std::unique_ptr<nexus::Subscription> monitoring_subscription_;
void start_monitoring() {
monitoring_subscription_ = messages()->subscribe("metrics.*",
[this](const nexus::Message& msg) { handle_metrics(msg); });
}
void stop_monitoring() {
monitoring_subscription_.reset(); // Automatically unsubscribes
}
// Advanced: Custom message dispatcher
class MessageDispatcher {
std::unordered_map<std::string, std::function<void(const nexus::Message&)>> handlers_;
public:
void register_handler(const std::string& message_type,
std::function<void(const nexus::Message&)> handler) {
handlers_[message_type] = handler;
}
void dispatch(const nexus::Message& message) {
auto type = message.get_header("type", "unknown");
if (auto it = handlers_.find(type); it != handlers_.end()) {
it->second(message);
} else {
// Default handler
logger()->warn("No handler for message type: {}", type);
}
}
};
};
classdef ProcessorModule < nexus.Module
properties (Access = private)
subscriptions = {} % Store subscription handles
messageQueue % For async processing
end
methods
function initialize(obj)
% Simple subscription
obj.subscribe('sensors.temperature', @obj.handleTemperature);
% Pattern subscription (wildcards)
obj.subscribe('sensors.*', @obj.handleAnySensor);
obj.subscribe('alerts.**.critical', @obj.handleCriticalAlerts);
% Subscription with filter
filterFunc = @(msg) isfield(msg.headers, 'priority') && ...
strcmp(msg.headers.priority, 'high');
obj.subscribe('data.raw', @obj.handleHighPriorityData, ...
'Filter', filterFunc);
% Typed subscription
obj.subscribe('sensors.temperature.typed', ...
@obj.handleTypedTemperature, ...
'MessageType', 'TemperatureReading');
% Subscription with options
options = struct();
options.Filter = @(msg) ~strcmp(msg.metadata.senderId, obj.info.id);
options.ErrorHandler = @obj.handleSubscriptionError;
options.MaxConcurrency = 5;
options.BufferSize = 1000;
options.AcknowledgmentMode = 'manual';
obj.subscribe('commands.*', @obj.handleCommand, options);
% Initialize async message queue
obj.messageQueue = parallel.pool.DataQueue;
afterEach(obj.messageQueue, @obj.processQueuedMessage);
end
function handleTemperature(obj, message)
data = message.payload;
obj.logger.info('Temperature: %.2f%s from %s', ...
data.value, data.unit, message.getHeader('sensor-id'));
% Access metadata
obj.logger.debug('Message ID: %s, Sent at: %s', ...
message.metadata.messageId, ...
datestr(message.metadata.timestamp));
end
function handleTypedTemperature(obj, reading, message)
% Strongly typed handler
if reading.value > 100
alert = struct(...
'reading', reading, ...
'threshold', 100, ...
'timestamp', datetime('now', 'TimeZone', 'UTC') ...
);
obj.publish('alerts.temperature.high', alert);
end
end
function handleCommand(obj, message)
try
% Process command
command = message.getPayload('Command');
obj.processCommand(command);
% Manual acknowledgment
message.acknowledge();
catch ME
% Negative acknowledgment
message.reject('Requeue', true);
rethrow(ME);
end
end
function handleSubscriptionError(obj, message, error)
obj.logger.error('Error processing message on topic %s: %s', ...
message.topic, error.message);
% Publish to dead letter queue
dlqPayload = struct(...
'originalMessage', message.toStruct(), ...
'error', error.message, ...
'failedAt', datetime('now', 'TimeZone', 'UTC') ...
);
obj.publish(sprintf('dlq.%s', message.topic), dlqPayload);
end
% Batch message processing
function subscribeBatch(obj, topic, batchSize, timeout)
batch = {};
flushTimer = [];
function processBatch()
if ~isempty(batch)
obj.processBatchedMessages(batch);
batch = {};
end
end
function handleMessage(message)
batch{end+1} = message;
% Reset pending flush timer (variable renamed so it does not shadow timer())
if ~isempty(flushTimer)
stop(flushTimer);
delete(flushTimer);
flushTimer = [];
end
if length(batch) >= batchSize
processBatch();
else
% Set timeout for partial batch
flushTimer = timer('ExecutionMode', 'singleShot', ...
'TimerFcn', @(~,~) processBatch(), ...
'StartDelay', timeout);
start(flushTimer);
end
end
obj.subscribe(topic, @handleMessage);
end
% Async message processing
function handleAsync(obj, message)
% Queue for processing in parallel
send(obj.messageQueue, message);
end
function processQueuedMessage(obj, message)
% Process in worker thread
try
result = obj.heavyProcessing(message.payload);
obj.publish('results.processed', result);
catch ME
obj.logger.error('Async processing failed: %s', ME.message);
end
end
% Subscription management
function sub = subscribeTemporary(obj, topic, handler, duration)
sub = obj.subscribe(topic, handler);
% Auto-unsubscribe after duration
t = timer('ExecutionMode', 'singleShot', ...
'TimerFcn', @(~,~) sub.unsubscribe(), ...
'StartDelay', duration);
start(t);
end
end
end
// LabVIEW Message Subscription Patterns
// 1. Basic Subscription Setup
// === InitializeSubscriptions.vi ===
// Called during module initialization:
// 1. Nexus.MessageBus.Subscribe
// - Topic: "sensors.temperature"
// - Handler VI: HandleTemperature.vi
// - Returns: Subscription Reference
// 2. Store reference in module private data
// 2. Message Handler VI Template
// === HandleTemperature.vi ===
// Inputs:
// - Message (Cluster):
// - Topic (String)
// - Payload (Variant)
// - Headers (Variant Map)
// - Metadata (Cluster)
// Implementation:
// 1. Variant to Data (Payload -> Expected Type)
// 2. Extract Headers if needed
// 3. Process message data
// 4. Error handling with logging
// 3. Pattern Subscription
// === SubscribeWithWildcards.vi ===
// Examples:
// - "sensors.*" - All sensor messages
// - "alerts.**.critical" - Any critical alerts
// - "data.+" - Exactly one level
// Implementation:
// Multiple Subscribe calls with patterns
// Store refs in array for management
// 4. Filtered Subscription
// === SubscribeWithFilter.vi ===
// Create filter VI that returns Boolean:
// FilterHighPriority.vi:
// Input: Message
// Output: Boolean
// Logic: Check Headers["priority"] == "high"
// Then:
// Nexus.MessageBus.SubscribeFiltered
// - Topic: "data.raw"
// - Handler: ProcessData.vi
// - Filter: FilterHighPriority.vi
// 5. Subscription Manager Class
// === SubscriptionManager.lvclass ===
// Private Data:
// - Subscriptions (Array of References)
// - Handlers (Map of Topic->VI Ref)
// Methods:
// - AddSubscription(Topic, Handler)
// - RemoveSubscription(Topic)
// - PauseAll()
// - ResumeAll()
// - Cleanup()
// 6. Async Message Processing
// === AsyncMessageProcessor.vi ===
// Architecture:
// [Message Handler] --> [Queue] --> [Consumer Loop]
// |
// v
// [Process Heavy Task]
// Implementation:
// 1. Handler enqueues message
// 2. Consumer dequeues and processes
// 3. Parallel loops for throughput
// 7. Message Acknowledgment Pattern
// === HandleWithAck.vi ===
// For guaranteed delivery:
// 1. Process message
// 2. If success:
// - Message.Acknowledge()
// 3. If failure:
// - Message.Reject(Requeue=True)
// 4. Error propagation
// 8. Event-Based Message Handling
// === EventBasedSubscriber.vi ===
// Using User Events:
// 1. Create User Event (Message type)
// 2. In handler: Generate User Event
// 3. Event Structure handles messages
// Benefits:
// - Natural LabVIEW pattern
// - Easy UI integration
// - Built-in queuing
// 9. Message Router Pattern
// === MessageRouter.vi ===
// Route messages to different handlers:
// Case Structure on Message.Headers["type"]:
// "sensor": CallSensorHandler.vi
// "command": CallCommandHandler.vi
// "status": CallStatusHandler.vi
// default: LogUnhandled.vi
// 10. Performance Optimization
// === HighPerformanceSubscriber.vi ===
// For high message rates:
// 1. Pre-allocate arrays
// 2. Use circular buffers
// 3. Batch processing
// 4. Avoid property nodes in loops
// 5. Use subroutine priority
// Complete Subscription Example:
// === TemperatureMonitor.vi ===
// [Initialize]
// |
// v
// [Subscribe Multiple Topics]
// |
// +--> "sensors.temperature.*"
// | Handler: ProcessTemp.vi
// |
// +--> "config.temperature"
// | Handler: UpdateConfig.vi
// |
// +--> "commands.temperature.*"
// Handler: HandleCommands.vi
// Filter: CheckPermissions.vi
//
// [Main Loop]
// |
// v
// [Event Structure]
// |
// +--> User Event: Shutdown
// | - Call UnsubscribeAll.vi
// |
// +--> Timeout: 1000ms
// - Check subscription health
// - Update metrics
// Error Handling Best Practices:
// - Always wrap handlers in error clusters
// - Log errors but don't crash
// - Use error callback registration
// - Implement circuit breaker for bad messages
// - Dead letter queue for failed processing
Request/Response Pattern
Synchronous Communication
While NEXUS-1 is primarily asynchronous, the SDK provides request/response patterns for synchronous-style communication when needed.
// C# Request/Response
// Making a request
var response = await Messages.RequestAsync<DataResponse>(
"service.getData",
new DataRequest { Id = "123", Type = "sensor" },
timeout: TimeSpan.FromSeconds(5)
);
// Handling requests
await Messages.HandleRequestsAsync<DataRequest, DataResponse>(
"service.getData",
async (request, context) =>
{
var data = await LoadData(request.Id);
return new DataResponse { Data = data, Success = true };
}
);
// Python Request/Response
# Making a request
response = await self.messages.request(
'service.getData',
{'id': '123', 'type': 'sensor'},
timeout=5.0,
response_type=DataResponse
)
# Handling requests
@self.messages.handle_request('service.getData')
async def handle_get_data(request: DataRequest) -> DataResponse:
data = await self.load_data(request.id)
return DataResponse(data=data, success=True)
Message Headers and Metadata
Common Headers
Headers provide message context and routing information without affecting the payload:
- content-type - Payload MIME type
- encoding - Character encoding (e.g., utf-8)
- priority - Message priority (low/normal/high/critical)
- ttl - Time-to-live in seconds
- trace-id - Distributed tracing identifier
- correlation-id - Group related messages
- reply-to - Topic for responses
- source - Origin system/module
- version - Message schema version
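Setting several of these headers on a publish might look like the sketch below; the topic, the order payload, and the requestId variable are illustrative assumptions:
// Sketch: attaching common headers to a published message
var headers = new Dictionary<string, string>
{
    ["content-type"] = "application/json",
    ["priority"] = "high",
    ["ttl"] = "60", // seconds
    ["trace-id"] = Guid.NewGuid().ToString(),
    ["correlation-id"] = requestId, // assumed local variable
    ["reply-to"] = "orders.replies"
};
await Messages.PublishAsync("orders.created", order, headers);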
Metadata Fields
System-generated metadata available on all messages:
- MessageId - Unique message identifier
- SenderId - Module that sent the message
- Timestamp - When message was created
- CorrelationId - Links related messages
- SequenceNumber - Order in a sequence
- RedeliveryCount - Number of delivery attempts
Best Practices
Message Handling Guidelines
- Use Hierarchical Topics: Organize topics with dot notation (e.g., domain.category.specific)
- Keep Payloads Small: Large messages impact performance - consider streaming for big data
- Version Your Messages: Include version in headers for schema evolution
- Handle Errors Gracefully: Always implement error handlers for subscriptions
- Use Appropriate Patterns: Pub/sub for events, request/response for queries
- Set Message TTL: Prevent stale messages from being processed
- Leverage Headers: Use headers for metadata, not payload
- Implement Idempotency: Handle duplicate messages gracefully (see the sketch after this list)
- Monitor Message Flow: Track publish/subscribe metrics
- Document Message Contracts: Clear documentation for message formats
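A minimal idempotent-handler sketch, deduplicating on MessageId with an in-memory set; a real implementation would bound or persist the set, and the Order type and ProcessOrder method are illustrative:
// Sketch: ignore duplicate deliveries by tracking MessageId
private readonly HashSet<string> _seenMessageIds = new();

private async Task HandleOrder(Message message)
{
    if (!_seenMessageIds.Add(message.Metadata.MessageId))
    {
        Logger.LogDebug("Duplicate message {Id} ignored", message.Metadata.MessageId);
        return; // Already processed: skipping keeps the handler idempotent
    }
    await ProcessOrder(message.GetPayload<Order>()); // ProcessOrder/Order are illustrative
}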
Message Streaming
Stream messages in real-time using async enumerable patterns for high-throughput scenarios.
Streaming API
The SDK provides streaming capabilities for continuous message flow without polling.
// Stream messages with pattern matching
await foreach (var envelope in MessageBus.StreamAsync("sensors.*", cancellationToken))
{
var data = envelope.Message;
var context = envelope.Context;
Logger.LogInformation($"Received {data.Value} from {context.SourceModuleName}");
// Process streaming data
await ProcessSensorData(data);
// Access message metadata
if (context.Headers.TryGetValue("priority", out var priority))
{
// Handle based on priority
}
}
// Stream with timeout
using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(5));
await foreach (var envelope in MessageBus.StreamAsync("alerts.critical", cts.Token))
{
await HandleCriticalAlert(envelope.Message);
}
# Stream messages with pattern matching
async for envelope in self.message_bus.stream("sensors.*", SensorData):
data = envelope.message
context = envelope.context
self.logger.info(f"Received {data.value} from {context.source_module_name}")
# Process streaming data
await self.process_sensor_data(data)
# Access message metadata
priority = context.headers.get("priority")
if priority:
# Handle based on priority
pass
# Stream with timeout
try:
async with asyncio.timeout(300): # 5 minutes
async for envelope in self.message_bus.stream("alerts.critical", Alert):
await self.handle_critical_alert(envelope.message)
except asyncio.TimeoutError:
self.logger.info("Stream timeout reached")
// Stream messages with pattern matching
auto stream = messageBus->StreamAsync<SensorData>("sensors.*", cancellationToken);
for co_await (const auto& envelope : stream)
{
const auto& data = envelope.Message;
const auto& context = envelope.Context;
logger->Info("Received " + std::to_string(data.Value) +
" from " + context.SourceModuleName);
// Process streaming data
co_await ProcessSensorData(data);
// Access message metadata
auto it = context.Headers.find("priority");
if (it != context.Headers.end())
{
// Handle based on priority
}
}
// Stream with timeout
auto cts = CancellationTokenSource(std::chrono::minutes(5));
auto alertStream = messageBus->StreamAsync<Alert>("alerts.critical", cts.Token());
for co_await (const auto& envelope : alertStream)
{
co_await HandleCriticalAlert(envelope.Message);
}
Message Envelopes
Message envelopes provide rich context alongside your message payload, enabling sophisticated message processing patterns.
MessageEnvelope Structure
Every streamed message comes wrapped in an envelope containing metadata and context.
public class MessageEnvelope<T> where T : class
{
public T Message { get; set; } // Your message payload
public MessageContext Context { get; set; } // Message metadata
}
public class MessageContext
{
public Guid MessageId { get; set; } // Unique message identifier
public string Topic { get; set; } // Message topic/pattern
public DateTime Timestamp { get; set; } // When message was sent
public Guid SourceModuleId { get; set; } // Sender module ID
public string SourceModuleName { get; set; } // Sender module name
public Dictionary<string, string> Headers { get; set; } // Custom headers
public string? CorrelationId { get; set; } // For request correlation
// Message acknowledgment methods
public Task AckAsync(); // Acknowledge successful processing
public Task NackAsync(bool requeue = true); // Negative acknowledgment
}
Working with Message Context
Leverage message context for advanced routing, filtering, and processing decisions.
// Subscribe with manual acknowledgment
await MessageBus.SubscribeAsync<OrderMessage>(
async (message, context) =>
{
try
{
// Check message age
var age = DateTime.UtcNow - context.Timestamp;
if (age > TimeSpan.FromMinutes(5))
{
Logger.LogWarning($"Skipping stale message: {context.MessageId}");
await context.AckAsync(); // Acknowledge but don't process
return;
}
// Process based on headers
if (context.Headers.TryGetValue("version", out var version))
{
switch (version)
{
case "1.0":
await ProcessV1Order(message);
break;
case "2.0":
await ProcessV2Order(message);
break;
default:
Logger.LogError($"Unknown version: {version}");
await context.NackAsync(false); // Don't requeue
return;
}
}
// Track correlation
if (!string.IsNullOrEmpty(context.CorrelationId))
{
Logger.LogInformation($"Processing correlated message: {context.CorrelationId}");
}
// Manual acknowledgment
await context.AckAsync();
}
catch (Exception ex)
{
Logger.LogError(ex, "Failed to process order");
await context.NackAsync(true); // Requeue for retry
}
},
new SubscriptionOptions
{
AutoAck = false, // Manual acknowledgment
MaxConcurrency = 5
}
);
# Subscribe with manual acknowledgment
async def handle_order(message: OrderMessage, context: MessageContext):
try:
# Check message age
age = datetime.utcnow() - context.timestamp
if age > timedelta(minutes=5):
self.logger.warning(f"Skipping stale message: {context.message_id}")
await context.ack() # Acknowledge but don't process
return
# Process based on headers
version = context.headers.get("version")
if version == "1.0":
await self.process_v1_order(message)
elif version == "2.0":
await self.process_v2_order(message)
else:
self.logger.error(f"Unknown version: {version}")
await context.nack(requeue=False) # Don't requeue
return
# Track correlation
if context.correlation_id:
self.logger.info(f"Processing correlated message: {context.correlation_id}")
# Manual acknowledgment
await context.ack()
except Exception as e:
self.logger.error(f"Failed to process order: {e}")
await context.nack(requeue=True) # Requeue for retry
await self.message_bus.subscribe(
OrderMessage,
handle_order,
SubscriptionOptions(
auto_ack=False, # Manual acknowledgment
max_concurrency=5
)
)
Advanced Streaming Patterns
Buffered Stream Processing
Process streams in batches for improved efficiency.
public async Task ProcessStreamWithBuffer<T>(string pattern, int bufferSize = 100)
where T : class
{
var buffer = new List<MessageEnvelope<T>>(bufferSize);
// FlushBuffer (not shown) must synchronize access to the shared buffer
var flushTimer = new Timer(_ => FlushBuffer(buffer), null,
TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(5));
try
{
await foreach (var envelope in MessageBus.StreamAsync<T>(pattern, _cancellationToken))
{
buffer.Add(envelope);
if (buffer.Count >= bufferSize)
{
await ProcessBatch(buffer);
buffer.Clear();
}
}
}
finally
{
flushTimer?.Dispose();
if (buffer.Count > 0)
{
await ProcessBatch(buffer);
}
}
}
private async Task ProcessBatch(List> batch) where T : class
{
Logger.LogInformation($"Processing batch of {batch.Count} messages");
// Bulk processing logic
var tasks = batch.Select(envelope => ProcessSingleMessage(envelope));
await Task.WhenAll(tasks);
// Acknowledge all messages in batch
var ackTasks = batch.Select(e => e.Context.AckAsync());
await Task.WhenAll(ackTasks);
}
Stream Aggregation Pattern
Aggregate streaming data over time windows.
class StreamAggregator:
def __init__(self, window_seconds: int = 60):
self.window_seconds = window_seconds
self.current_window = {}
self.window_start = datetime.utcnow()
async def aggregate_stream(self, pattern: str):
async for envelope in self.message_bus.stream(pattern, MetricData):
metric = envelope.message
context = envelope.context
# Check if we need to flush the window
if (datetime.utcnow() - self.window_start).total_seconds() > self.window_seconds:
await self.flush_window()
# Add to current window
key = metric.name
if key not in self.current_window:
self.current_window[key] = {
'sum': 0,
'count': 0,
'min': float('inf'),
'max': float('-inf'),
'first_timestamp': context.timestamp
}
stats = self.current_window[key]
stats['sum'] += metric.value
stats['count'] += 1
stats['min'] = min(stats['min'], metric.value)
stats['max'] = max(stats['max'], metric.value)
stats['last_timestamp'] = context.timestamp
# Acknowledge message
await context.ack()
async def flush_window(self):
if not self.current_window:
return
# Calculate aggregates
aggregates = []
for name, stats in self.current_window.items():
aggregates.append({
'name': name,
'average': stats['sum'] / stats['count'],
'min': stats['min'],
'max': stats['max'],
'count': stats['count'],
'window_start': self.window_start,
'window_end': datetime.utcnow()
})
# Publish aggregated results
await self.message_bus.publish(
AggregatedMetrics(aggregates),
MessageOptions(topic="metrics.aggregated")
)
# Reset window
self.current_window = {}
self.window_start = datetime.utcnow()
Stream Filtering and Transformation
Filter and transform streams before processing.
// Extension methods for stream processing
public static class StreamExtensions
{
public static async IAsyncEnumerable> Where(
this IAsyncEnumerable> source,
Func, bool> predicate) where T : class
{
await foreach (var item in source)
{
if (predicate(item))
yield return item;
}
}
public static async IAsyncEnumerable> Select(
this IAsyncEnumerable> source,
Func transform)
where T : class
where TResult : class
{
await foreach (var envelope in source)
{
yield return new MessageEnvelope<TResult>
{
Message = transform(envelope.Message),
Context = envelope.Context
};
}
}
}
// Usage example
var criticalAlerts = MessageBus.StreamAsync<SensorData>("sensors.*", cancellationToken)
.Where(envelope => envelope.Message.Value > envelope.Message.Threshold)
.Select(data => new Alert
{
Level = "Critical",
Source = data.SensorId,
Value = data.Value,
Message = $"Threshold exceeded: {data.Value} > {data.Threshold}"
});
await foreach (var alert in criticalAlerts)
{
await HandleAlert(alert.Message);
await alert.Context.AckAsync();
}
Message Options and Headers
Publishing with Options
Enrich messages with metadata using MessageOptions.
// Publish with full options
await MessageBus.PublishAsync(
new DataProcessed
{
RecordCount = 1000,
Duration = processingTime
},
new MessageOptions
{
Topic = "processing.completed",
Priority = MessagePriority.High,
Expiration = TimeSpan.FromMinutes(5),
CorrelationId = request.CorrelationId,
Persistent = true,
Headers = new Dictionary<string, string>
{
["version"] = "2.0",
["source-system"] = "data-processor",
["batch-id"] = batchId.ToString(),
["retry-count"] = "0"
}
}
);
// Priority levels
public enum MessagePriority
{
Low = 0, // Background tasks
Normal = 1, // Regular processing
High = 2, // Important updates
Critical = 3 // System alerts
}
# Publish with full options
await self.message_bus.publish(
DataProcessed(
record_count=1000,
duration=processing_time
),
MessageOptions(
topic="processing.completed",
priority=MessagePriority.HIGH,
expiration=timedelta(minutes=5),
correlation_id=request.correlation_id,
persistent=True,
headers={
"version": "2.0",
"source-system": "data-processor",
"batch-id": str(batch_id),
"retry-count": "0"
}
)
)
# Priority levels
class MessagePriority(Enum):
LOW = 0 # Background tasks
NORMAL = 1 # Regular processing
HIGH = 2 # Important updates
CRITICAL = 3 # System alerts
Streaming Best Practices
- Handle Backpressure: Implement buffering or rate limiting for high-volume streams (see the sketch after this list)
- Set Timeouts: Always use cancellation tokens to prevent infinite streams
- Batch Processing: Group messages for efficient processing
- Error Recovery: Implement retry logic with exponential backoff
- Monitor Stream Health: Track message rates and processing times
- Clean Shutdown: Properly dispose of stream subscriptions
- Memory Management: Be mindful of buffer sizes with large messages
- Correlation Tracking: Use correlation IDs for request tracing
- Version Headers: Include version info for backward compatibility
- Acknowledge Strategically: Balance between data safety and performance
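For the backpressure guideline, one approach in C# is to decouple the stream reader from slow processing with a bounded channel from System.Threading.Channels: when the buffer fills, WriteAsync suspends and the reader stops pulling from the bus. This is a sketch built on the StreamAsync example above, assuming the same SensorData type, ProcessSensorData helper, and cancellationToken from a module context.
using System.Threading.Channels;

var channel = Channel.CreateBounded<MessageEnvelope<SensorData>>(
    new BoundedChannelOptions(capacity: 500)
    {
        FullMode = BoundedChannelFullMode.Wait // writer awaits when the buffer is full
    });

// Reader: pull from the bus into the bounded buffer
var reader = Task.Run(async () =>
{
    await foreach (var envelope in MessageBus.StreamAsync<SensorData>("sensors.*", cancellationToken))
    {
        await channel.Writer.WriteAsync(envelope, cancellationToken); // suspends when full
    }
    channel.Writer.Complete();
});

// Processor: drain the buffer at its own pace
await foreach (var envelope in channel.Reader.ReadAllAsync(cancellationToken))
{
    await ProcessSensorData(envelope.Message); // potentially slow work
    await envelope.Context.AckAsync();
}
await reader;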
Message Bus Patterns
Advanced messaging patterns for building scalable, maintainable module communication.
Message Serialization
Serialization Format Options
Choose the serialization format that best suits your performance and interoperability needs:
- JSON: Human-readable, widely compatible, ideal for debugging and cross-language communication
- MessagePack: Binary format, more compact and faster than JSON, good for high-frequency messages
- Protocol Buffers: Strongly-typed binary format, best performance, requires schema definition
Configuring Serialization Format
You can specify the serialization format in your module manifest or programmatically:
# In nexus-manifest.yaml
modules:
- name: "high-frequency-sensor"
serialization: "messagepack" # Options: json, messagepack, protobuf
- name: "debug-module"
serialization: "json" # Default if not specified
Programmatic Control
// C# - Override default serialization
public class MyModule : ModuleBase
{
protected override void OnConfigure(ModuleConfiguration config)
{
// Set module-wide default
config.DefaultSerialization = SerializationFormat.MessagePack;
// Or configure per-topic
config.TopicSerializations["sensors.highfreq.*"] = SerializationFormat.MessagePack;
config.TopicSerializations["debug.*"] = SerializationFormat.Json;
}
// Send a message with a specific format
private async Task PublishHighFrequencyAsync(SensorReading payload)
{
await Messages.PublishAsync("sensors.data", payload,
options: new PublishOptions { Format = SerializationFormat.MessagePack });
}
}
// Python - Configure serialization
class MyModule(Module):
def configure(self, config):
# Set default format
config.default_serialization = 'messagepack'
# Configure per-topic
config.topic_serializations['sensors.highfreq.*'] = 'messagepack'
config.topic_serializations['debug.*'] = 'json'
async def send_data(self, data):
# Use default format
await self.messages.publish('sensors.data', data)
# Override for specific message
await self.messages.publish('debug.info', data, format='json')
Format Selection Guidelines
- Use JSON when:
- Human readability is important
- Debugging or development phase
- Cross-platform compatibility is critical
- Message frequency is low to moderate
- Use MessagePack when:
- Performance is important but not critical
- Message size needs to be reduced
- Working with dynamic/untyped data
- Need binary format without schema complexity
- Use Protocol Buffers when:
- Maximum performance is required
- Strong typing and schema validation needed
- Working with complex nested structures
- Building high-frequency trading or control systems
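To make the size trade-off concrete, the sketch below serializes the same record with System.Text.Json and with the MessagePack-CSharp library, then compares payload sizes. The MessagePack package reference and the Reading type are assumptions for illustration, not part of the SDK.
using System.Text.Json;
using MessagePack; // assumes the MessagePack-CSharp NuGet package

var reading = new Reading { SensorId = "temp-001", Value = 23.5, TimestampTicks = DateTime.UtcNow.Ticks };

byte[] json = JsonSerializer.SerializeToUtf8Bytes(reading);
byte[] msgpack = MessagePackSerializer.Serialize(reading);

// MessagePack is typically noticeably smaller than JSON for the same record,
// which matters most at high message rates.
Console.WriteLine($"JSON: {json.Length} bytes, MessagePack: {msgpack.Length} bytes");

[MessagePackObject]
public class Reading
{
    [Key(0)] public string SensorId { get; set; } = "";
    [Key(1)] public double Value { get; set; }
    [Key(2)] public long TimestampTicks { get; set; }
}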
Communication Patterns Overview
Available Patterns
Pattern | Description | Use Case | Delivery Guarantee |
---|---|---|---|
Publish/Subscribe | One-to-many broadcasting | Event notifications, telemetry data | At most once |
Request/Response | Synchronous RPC-style calls | Queries, commands with results | Exactly once |
Fire-and-Forget | Asynchronous one-way messages | Commands, logging, metrics | At most once |
Streaming | Continuous data flow | Real-time data, large transfers | Ordered delivery |
Publish/Subscribe Pattern
Overview
The publish/subscribe pattern allows modules to broadcast messages to multiple subscribers without knowing who will receive them.
One publisher broadcasts messages to multiple subscribers through a topic
// Publishing messages
public class SensorModule : ModuleBase
{
private async Task PublishSensorData()
{
var data = new SensorReading
{
SensorId = "temp-001",
Value = 23.5,
Unit = "celsius",
Timestamp = DateTime.UtcNow
};
// Publish to a specific topic
await Messages.SendAsync("sensors.temperature.zone1", data);
}
}
// Subscribing to messages
public class MonitorModule : ModuleBase
{
protected override void OnInitialized()
{
// Subscribe to specific topic
Messages.SubscribeAsync("sensors.temperature.zone1", HandleTemperature);
// Subscribe with wildcards
Messages.SubscribeAsync("sensors.*.zone1", HandleAnySensorInZone1);
Messages.SubscribeAsync("sensors.temperature.*", HandleAllTemperatures);
Messages.SubscribeAsync("sensors.**", HandleAllSensorData);
}
private async Task HandleTemperature(Message message)
{
var reading = message.GetPayload<SensorReading>();
Logger.LogInformation($"Temperature: {reading.Value}{reading.Unit}");
}
}
# Publishing messages
class SensorModule(Module):
async def publish_sensor_data(self):
data = {
"sensor_id": "temp-001",
"value": 23.5,
"unit": "celsius",
"timestamp": datetime.now().isoformat()
}
# Publish to a specific topic
await self.publish("sensors.temperature.zone1", data)
# Subscribing to messages
class MonitorModule(Module):
def on_initialized(self):
# Subscribe to specific topic
self.subscribe("sensors.temperature.zone1", self.handle_temperature)
# Subscribe with wildcards
self.subscribe("sensors.*.zone1", self.handle_any_sensor_zone1)
self.subscribe("sensors.temperature.*", self.handle_all_temperatures)
self.subscribe("sensors.**", self.handle_all_sensor_data)
async def handle_temperature(self, message):
reading = message.payload
self.logger.info(f"Temperature: {reading['value']}{reading['unit']}")
// Publishing messages
class SensorModule : public nexus::ModuleBase {
void publish_sensor_data() {
SensorReading data{
.sensor_id = "temp-001",
.value = 23.5,
.unit = "celsius",
.timestamp = std::chrono::system_clock::now()
};
// Publish to a specific topic
messages()->send("sensors.temperature.zone1", data);
}
};
// Subscribing to messages
class MonitorModule : public nexus::ModuleBase {
protected:
void on_initialized() override {
// Subscribe to specific topic
messages()->subscribe("sensors.temperature.zone1",
[this](const nexus::Message& msg) {
handle_temperature(msg);
});
// Subscribe with wildcards
messages()->subscribe("sensors.*.zone1",
[this](const nexus::Message& msg) {
handle_any_sensor_zone1(msg);
});
}
private:
void handle_temperature(const nexus::Message& message) {
auto reading = message.get_payload<SensorReading>();
logger()->info("Temperature: {}{}", reading.value, reading.unit);
}
};
% Publishing messages
classdef SensorModule < Module
methods
function publishSensorData(obj)
data = struct(...
'sensorId', 'temp-001', ...
'value', 23.5, ...
'unit', 'celsius', ...
'timestamp', datetime('now') ...
);
% Publish to a specific topic
obj.publish('sensors.temperature.zone1', data);
end
end
end
% Subscribing to messages
classdef MonitorModule < Module
methods (Access = protected)
function onInitialized(obj)
% Subscribe to specific topic
obj.subscribe('sensors.temperature.zone1', @obj.handleTemperature);
% Subscribe with wildcards
obj.subscribe('sensors.*.zone1', @obj.handleAnySensorZone1);
obj.subscribe('sensors.temperature.*', @obj.handleAllTemperatures);
obj.subscribe('sensors.**', @obj.handleAllSensorData);
end
end
methods (Access = private)
function handleTemperature(obj, message)
reading = message.payload;
fprintf('Temperature: %.1f%s\n', reading.value, reading.unit);
end
end
end
// Publish/Subscribe Pattern in LabVIEW
//
// Publishing Messages:
// 1. Create message data cluster with fields:
// - SensorID (string): "temp-001"
// - Value (numeric): 23.5
// - Unit (string): "celsius"
// - Timestamp (timestamp)
//
// 2. Use Nexus.Publish VI
// - Wire topic string: "sensors.temperature.zone1"
// - Wire message data cluster
// - Message will be sent to all subscribers
//
// Subscribing to Messages:
// 1. Use Nexus.Subscribe VI in initialization
// - Wire topic string with patterns:
// * "sensors.temperature.zone1" - exact match
// * "sensors.*.zone1" - any sensor in zone1
// * "sensors.temperature.*" - all temperature sensors
// * "sensors.**" - all sensor data
//
// 2. Create Message Handler VI
// - Input: Message cluster (topic, payload, timestamp)
// - Extract payload data
// - Process as needed
// - No response required
//
// 3. Register handler with subscription
// - Wire VI reference to Subscribe VI
// - Handler called for each matching message
//
// Example Implementation:
// Main VI flow:
// 1. Initialize Module -> Subscribe to topics
// 2. Event Loop -> Publish sensor data periodically
// 3. Message handlers -> Process received messages
// 4. Shutdown -> Unsubscribe and cleanup
Topic Naming Conventions
Pattern | Description | Example | Matches |
---|---|---|---|
exact.topic.name | Exact match only | sensors.temperature.zone1 | Only sensors.temperature.zone1 |
topic.* | Single-level wildcard | sensors.*.zone1 | sensors.temperature.zone1, sensors.pressure.zone1 |
topic.** | Multi-level wildcard | sensors.** | sensors.temperature.zone1, sensors.pressure.zone2.rack3 |
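The wildcard semantics in the table above can be implemented with a segment-by-segment comparison. The sketch below is one plausible matcher, not the SDK's internal implementation:
// Matches a concrete topic against a pattern using the conventions above:
// '*' matches exactly one segment, '**' matches one or more trailing segments.
public static bool TopicMatches(string pattern, string topic)
{
    var p = pattern.Split('.');
    var t = topic.Split('.');
    for (int i = 0; i < p.Length; i++)
    {
        if (p[i] == "**")
        {
            // '**' must be the last pattern segment and swallows the rest of the topic
            return i == p.Length - 1 && t.Length > i;
        }
        if (i >= t.Length) return false;
        if (p[i] != "*" && p[i] != t[i]) return false;
    }
    return p.Length == t.Length;
}

// TopicMatches("sensors.*.zone1", "sensors.pressure.zone1")      -> true
// TopicMatches("sensors.**", "sensors.pressure.zone2.rack3")     -> true
// TopicMatches("sensors.*", "sensors.pressure.zone1")            -> false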
Request/Response Pattern
Overview
The request/response pattern enables synchronous communication where a module sends a request and waits for a response.
Synchronous request/response communication with the requester waiting for a response
// Making requests
public class ClientModule : ModuleBase
{
private async Task QueryData()
{
var request = new DataQuery
{
StartTime = DateTime.UtcNow.AddHours(-1),
EndTime = DateTime.UtcNow,
SensorId = "temp-001"
};
try
{
// Send request and wait for response (default 30s timeout)
var response = await Messages.RequestAsync<DataResponse>(
"data.query", request);
if (response != null)
{
Logger.LogInformation($"Received {response.DataPoints.Count} points");
}
}
catch (TimeoutException)
{
Logger.LogError("Request timed out");
}
}
}
// Handling requests
public class DataModule : ModuleBase
{
protected override void OnInitialized()
{
// Register request handler
Messages.RegisterHandlerAsync("data.query", HandleDataQuery);
}
private async Task<object> HandleDataQuery(Message message)
{
var query = message.GetPayload<DataQuery>();
// Process the query
var dataPoints = await FetchData(query);
// Return response
return new DataResponse
{
DataPoints = dataPoints,
QueryTime = DateTime.UtcNow
};
}
}
# Making requests
class ClientModule(Module):
async def query_data(self):
request = {
"start_time": (datetime.now() - timedelta(hours=1)).isoformat(),
"end_time": datetime.now().isoformat(),
"sensor_id": "temp-001"
}
try:
# Send request and wait for response
response = await self.request("data.query", request, timeout=30)
if response:
self.logger.info(f"Received {len(response['data_points'])} points")
except TimeoutError:
self.logger.error("Request timed out")
# Handling requests
class DataModule(Module):
def on_initialized(self):
# Register request handler
self.register_handler("data.query", self.handle_data_query)
async def handle_data_query(self, message):
query = message.payload
# Process the query
data_points = await self.fetch_data(query)
# Return response
return {
"data_points": data_points,
"query_time": datetime.now().isoformat()
}
// Making requests
class ClientModule : public nexus::ModuleBase {
void query_data() {
DataQuery request{
.start_time = std::chrono::system_clock::now() - 1h,
.end_time = std::chrono::system_clock::now(),
.sensor_id = "temp-001"
};
// Send request and wait for response
auto result = messages()->request<DataQuery, DataResponse>(
"data.query", request, 30s);
if (result.is_ok()) {
auto response = result.value();
logger()->info("Received {} points", response.data_points.size());
} else {
logger()->error("Request failed: {}", result.error().message());
}
}
};
// Handling requests
class DataModule : public nexus::ModuleBase {
protected:
void on_initialized() override {
// Register request handler
messages()->register_handler("data.query",
[this](const nexus::Message& msg) {
return handle_data_query(msg);
});
}
private:
nexus::Result<DataResponse> handle_data_query(const nexus::Message& message) {
auto query = message.get_payload<DataQuery>();
// Process the query
auto data_points = fetch_data(query);
// Return response
return DataResponse{
.data_points = std::move(data_points),
.query_time = std::chrono::system_clock::now()
};
}
};
% Making requests
classdef ClientModule < Module
methods
function queryData(obj)
% Create request
request = struct(...
'startTime', datetime('now') - hours(1), ...
'endTime', datetime('now'), ...
'sensorId', 'temp-001' ...
);
try
% Send request and wait for response
response = obj.request('data.query', request, 30);
if ~isempty(response)
fprintf('Received %d points\n', length(response.dataPoints));
end
catch ME
if strcmp(ME.identifier, 'Nexus:Timeout')
obj.logger.error('Request timed out');
else
rethrow(ME);
end
end
end
end
end
% Handling requests
classdef DataModule < Module
methods (Access = protected)
function onInitialized(obj)
% Register request handler
obj.registerHandler('data.query', @obj.handleDataQuery);
end
end
methods (Access = private)
function response = handleDataQuery(obj, message)
query = message.payload;
% Process the query
dataPoints = obj.fetchData(query);
% Return response
response = struct(...
'dataPoints', dataPoints, ...
'queryTime', datetime('now') ...
);
end
end
end
// Request/Response Pattern in LabVIEW
//
// Making Requests:
// 1. Use Nexus.Request VI
// 2. Wire topic string: "data.query"
// 3. Wire request data cluster
// 4. Set timeout (default 30s)
// 5. Handle response or timeout error
//
// Request Data Cluster:
// - StartTime (timestamp)
// - EndTime (timestamp)
// - SensorID (string)
//
// Response Data Cluster:
// - DataPoints (array)
// - QueryTime (timestamp)
//
// Handling Requests:
// 1. Register handler using Nexus.RegisterHandler VI
// 2. Wire topic: "data.query"
// 3. Wire handler VI reference
// 4. Handler VI must:
// - Accept Message cluster input
// - Return Response variant
// - Complete within timeout
//
// Example Handler Implementation:
// - Extract query from Message.Payload
// - Process data request
// - Build response cluster
// - Return as variant
Fire-and-Forget Pattern
Overview
Fire-and-forget is for one-way messages where the sender doesn't need confirmation or a response. It's essentially publish/subscribe with a single intended recipient.
One-way asynchronous messages with no response expected
// Sending commands without waiting for response
public class ControlModule : ModuleBase
{
private async Task SendCommands()
{
// Start pump - fire and forget
await Messages.SendAsync("control.pump.start", new
{
PumpId = "pump-001",
Speed = 1500,
RampTime = 5
});
// Log event - fire and forget
await Messages.SendAsync("logging.event", new
{
EventType = "PUMP_STARTED",
Timestamp = DateTime.UtcNow,
Details = "Pump started at 1500 RPM"
});
// Update metrics - fire and forget
await Messages.SendAsync("metrics.update", new
{
MetricName = "pump.speed",
Value = 1500,
Unit = "RPM"
});
// Continue immediately without waiting for any responses
Logger.LogInformation("Commands sent, continuing operation");
}
}
// Receiving fire-and-forget messages
public class LoggerModule : ModuleBase
{
protected override void OnInitialized()
{
// Subscribe to logging events
Messages.SubscribeAsync("logging.event", async (message) =>
{
var evt = message.GetPayload();
await StoreEvent(evt);
// No response sent back
});
}
}
# Sending commands without waiting for response
class ControlModule(Module):
async def send_commands(self):
# Start pump - fire and forget
await self.publish("control.pump.start", {
"pump_id": "pump-001",
"speed": 1500,
"ramp_time": 5
})
# Log event - fire and forget
await self.publish("logging.event", {
"event_type": "PUMP_STARTED",
"timestamp": datetime.now().isoformat(),
"details": "Pump started at 1500 RPM"
})
# Update metrics - fire and forget
await self.publish("metrics.update", {
"metric_name": "pump.speed",
"value": 1500,
"unit": "RPM"
})
# Continue immediately without waiting for any responses
self.logger.info("Commands sent, continuing operation")
# Receiving fire-and-forget messages
class LoggerModule(Module):
def on_initialized(self):
# Subscribe to logging events
self.subscribe("logging.event", self.handle_log_event)
async def handle_log_event(self, message):
event = message.payload
await self.store_event(event)
# No response sent back
// Sending commands without waiting for response
class ControlModule : public nexus::ModuleBase {
void send_commands() {
// Start pump - fire and forget
messages()->send("control.pump.start", {
{"pump_id", "pump-001"},
{"speed", 1500},
{"ramp_time", 5}
});
// Log event - fire and forget
messages()->send("logging.event", {
{"event_type", "PUMP_STARTED"},
{"timestamp", std::chrono::system_clock::now()},
{"details", "Pump started at 1500 RPM"}
});
// Update metrics - fire and forget
messages()->send("metrics.update", {
{"metric_name", "pump.speed"},
{"value", 1500},
{"unit", "RPM"}
});
// Continue immediately without waiting for any responses
logger()->info("Commands sent, continuing operation");
}
};
// Receiving fire-and-forget messages
class LoggerModule : public nexus::ModuleBase {
protected:
void on_initialized() override {
// Subscribe to logging events
messages()->subscribe("logging.event",
[this](const nexus::Message& msg) {
handle_log_event(msg);
});
}
private:
void handle_log_event(const nexus::Message& message) {
auto event = message.get_payload();
store_event(event);
// No response sent back
}
};
% Sending commands without waiting for response
classdef ControlModule < Module
methods
function sendCommands(obj)
% Start pump - fire and forget
obj.publish('control.pump.start', struct(...
'pumpId', 'pump-001', ...
'speed', 1500, ...
'rampTime', 5 ...
));
% Log event - fire and forget
obj.publish('logging.event', struct(...
'eventType', 'PUMP_STARTED', ...
'timestamp', datetime('now'), ...
'details', 'Pump started at 1500 RPM' ...
));
% Update metrics - fire and forget
obj.publish('metrics.update', struct(...
'metricName', 'pump.speed', ...
'value', 1500, ...
'unit', 'RPM' ...
));
% Continue immediately without waiting for any responses
obj.logger.info('Commands sent, continuing operation');
end
end
end
% Receiving fire-and-forget messages
classdef LoggerModule < Module
methods (Access = protected)
function onInitialized(obj)
% Subscribe to logging events
obj.subscribe('logging.event', @obj.handleLogEvent);
end
end
methods (Access = private)
function handleLogEvent(obj, message)
event = message.payload;
obj.storeEvent(event);
% No response sent back
end
end
end
// Fire-and-Forget Pattern in LabVIEW
//
// Sending Commands:
// 1. Use Nexus.Publish VI (same as pub/sub)
// 2. Wire topic string: "control.pump.start"
// 3. Wire command data cluster
// 4. Continue immediately - no wait
//
// Command Data Examples:
// Pump Control:
// - PumpID (string): "pump-001"
// - Speed (numeric): 1500
// - RampTime (numeric): 5
//
// Log Event:
// - EventType (string): "PUMP_STARTED"
// - Timestamp (timestamp)
// - Details (string)
//
// Metrics Update:
// - MetricName (string): "pump.speed"
// - Value (numeric): 1500
// - Unit (string): "RPM"
//
// Receiving Commands:
// 1. Subscribe using Nexus.Subscribe VI
// 2. Process in event structure
// 3. Do NOT send response
// 4. Handle errors locally
//
// Best Practices:
// - Use for non-critical operations
// - Log important commands locally
// - Implement retry logic if needed
// - Monitor for missed messages
Streaming Pattern
Overview
The streaming pattern enables continuous data flow between modules, ideal for real-time data or large data transfers.
// Stream producer
public class DataStreamModule : ModuleBase
{
private async Task StreamData()
{
// Create a stream
var stream = await Messages.CreateStreamAsync("data.realtime");
try
{
while (!cancellationToken.IsCancellationRequested)
{
var data = ReadSensorData();
// Write to stream
await stream.WriteAsync(data);
await Task.Delay(100); // 10Hz data rate
}
}
finally
{
// Close stream when done
await stream.CompleteAsync();
}
}
}
// Stream consumer
public class StreamConsumerModule : ModuleBase
{
protected override void OnInitialized()
{
Messages.SubscribeToStreamAsync("data.realtime", HandleStream);
}
private async Task HandleStream(IAsyncEnumerable<Message> stream)
{
await foreach (var message in stream)
{
var data = message.GetPayload();
ProcessRealtimeData(data);
}
}
}
# Stream producer
class DataStreamModule(Module):
async def stream_data(self):
# Create a stream
stream = await self.create_stream("data.realtime")
try:
while not self.is_stopping:
data = self.read_sensor_data()
# Write to stream
await stream.write(data)
await asyncio.sleep(0.1) # 10Hz data rate
finally:
# Close stream when done
await stream.complete()
# Stream consumer
class StreamConsumerModule(Module):
def on_initialized(self):
self.subscribe_to_stream("data.realtime", self.handle_stream)
async def handle_stream(self, stream):
async for message in stream:
data = message.payload
self.process_realtime_data(data)
// Stream producer
class SensorModule : public nexus::ModuleBase {
void start_streaming() {
// Create stream
auto stream = messages()->create_stream("sensors.realtime");
while (is_running()) {
auto data = read_sensor();
// Send data to stream
stream->send({
{"sensor_id", sensor_id_},
{"value", data.value},
{"timestamp", std::chrono::system_clock::now()}
});
std::this_thread::sleep_for(100ms);
}
// End stream
stream->close();
}
};
// Stream consumer
class AnalyticsModule : public nexus::ModuleBase {
void on_initialized() override {
// Subscribe to stream
messages()->stream_subscribe("sensors.realtime",
[this](auto stream) {
handle_stream(std::move(stream));
});
}
void handle_stream(nexus::Stream stream) {
// Process stream data
for (const auto& message : stream) {
auto data = message.get_payload();
process_realtime_data(data);
}
}
};
% Stream producer
classdef SensorModule < Module
properties
streaming = false
streamHandle
end
methods
function startStreaming(obj)
% Create stream
obj.streamHandle = obj.createStream('sensors.realtime');
obj.streaming = true;
while obj.streaming
% Read sensor data
data = struct(...
'sensorId', obj.sensorId, ...
'value', readSensor(), ...
'timestamp', datetime('now') ...
);
% Send to stream
obj.streamHandle.send(data);
% Control rate
pause(0.1); % 10 Hz
end
% Close stream
obj.streamHandle.close();
end
function stopStreaming(obj)
obj.streaming = false;
end
end
end
% Stream consumer
classdef AnalyticsModule < Module
methods (Access = protected)
function onInitialized(obj)
% Subscribe to stream
obj.streamSubscribe('sensors.realtime', @obj.handleStream);
end
end
methods (Access = private)
function handleStream(obj, stream)
% Process stream data
while stream.isOpen()
message = stream.receive();
if ~isempty(message)
data = message.payload;
obj.processRealtimeData(data);
end
end
end
end
end
// Streaming Pattern in LabVIEW
//
// Stream Producer:
// 1. Create Stream VI
// - Wire topic: "sensors.realtime"
// - Returns stream reference
//
// 2. Producer Loop
// - Read sensor data
// - Build data cluster:
// * SensorID (string)
// * Value (numeric)
// * Timestamp (timestamp)
// - Use Stream.Send VI
// - Control loop rate (e.g., 10 Hz)
//
// 3. Close Stream
// - Use Stream.Close VI when done
// - Important for proper cleanup
//
// Stream Consumer:
// 1. Subscribe to Stream VI
// - Wire topic: "sensors.realtime"
// - Wire handler VI reference
//
// 2. Stream Handler VI
// - Input: Stream reference
// - While loop with Stream.IsOpen check
// - Stream.Receive VI to get messages
// - Process each message payload
// - Exit when stream closes
//
// Key Considerations:
// - Buffering: Stream handles backpressure
// - Rate Control: Producer should limit rate
// - Error Handling: Check stream status
// - Memory: Process data without accumulating
//
// Example Flow:
// Producer VI:
// 1. Initialize -> Create Stream
// 2. While Running -> Send sensor data at 10Hz
// 3. Shutdown -> Close stream
//
// Consumer VI:
// 1. Initialize -> Subscribe to stream
// 2. Handler Loop -> Process incoming data
// 3. Stream closes -> Exit handler
- Streams maintain order of messages
- Always close streams when done to free resources
- Handle backpressure - consumers might be slower than producers
- Streams are unicast (one producer, one consumer)
Pattern Selection Guide
When to Use Each Pattern
Use Publish/Subscribe When:
- Multiple modules need the same information
- Broadcasting events or state changes
- Decoupling producers from consumers
- Building event-driven architectures
Examples: Sensor readings, status updates, alerts, telemetry
Use Request/Response When:
- You need a specific answer to a query
- Performing operations that return results
- Implementing command patterns with confirmation
- Building RPC-style interfaces
Examples: Database queries, configuration requests, command execution with status
Use Fire-and-Forget When:
- No response is needed
- Maximum performance is required
- Operations are non-critical
- Logging or metrics collection
Examples: Log messages, metrics, non-critical commands
Use Streaming When:
- Continuous data flow is needed
- Real-time data processing
- Large data transfers
- Maintaining message order is critical
Examples: Video streams, audio data, high-frequency sensor data
Error Handling
Pattern-Specific Error Handling
Pattern | Error Type | Handling Strategy |
---|---|---|
Publish/Subscribe | No subscribers | Message is dropped silently (normal behavior) |
Publish/Subscribe | Subscriber error | Logged but doesn't affect publisher |
Request/Response | No handler | Immediate error returned to caller |
Request/Response | Timeout | TimeoutException after specified duration |
Request/Response | Handler exception | Error propagated to caller |
Fire-and-Forget | Delivery failure | Logged but sender not notified |
Fire-and-Forget | No handler | Message dropped, logged if configured |
Streaming | Consumer too slow | Backpressure applied or buffer overflow |
Streaming | Stream broken | Both sides notified, stream terminated |
Streaming | Producer failure | Stream completed with error status |
Best Practices for Error Handling
// C# Complete error handling
public class RobustModule : ModuleBase
{
protected override void OnInitialized()
{
// Subscribe with error handling
Messages.SubscribeAsync("data.*", async (message) =>
{
try
{
await ProcessMessage(message);
}
catch (Exception ex)
{
Logger.LogError(ex, "Failed to process message on topic {Topic}",
message.Topic);
// Continue processing other messages
}
});
}
private async Task MakeRobustRequest()
{
// Use SDK-provided retry policy with exponential backoff
var retryPolicy = Recovery.CreateRetryPolicy(
maxAttempts: 3,
delay: TimeSpan.FromSeconds(1),
backoffMultiplier: 2.0,
maxDelay: TimeSpan.FromSeconds(30),
retryOn: new[] { typeof(TimeoutException) }
);
var response = await retryPolicy.ExecuteAsync(async () =>
{
return await Messages.RequestAsync<DataResponse>(
"data.query",
new { id = "123" },
TimeSpan.FromSeconds(10));
},
onRetry: (attempt, delay, exception) =>
{
Logger.LogWarning("Request timeout, retry {Attempt} after {Delay}ms",
attempt, delay.TotalMilliseconds);
});
if (response != null)
{
ProcessResponse(response);
}
}
}
Performance Considerations
Pattern Performance Characteristics
Pattern | Latency | Throughput | Resource Usage |
---|---|---|---|
Publish/Subscribe | Low | High | Low (scales with subscribers) |
Request/Response | Medium | Medium | Medium (connection per request) |
Fire-and-Forget | Lowest | Highest | Lowest |
Streaming | Low | Very High | High (maintains connection) |
Optimization Tips
- Batch messages when possible to reduce overhead
- Use appropriate timeouts for request/response to avoid blocking
- Consider message size - large messages impact all patterns
- Monitor queue depths to detect slow consumers (see the sketch after this list)
- Use streaming for continuous high-volume data instead of many individual messages
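As an illustration of the queue-depth tip, a consumer can track its own backlog with an atomic counter and warn when it crosses a threshold. The names below (QueueDepthMonitor, WarnThreshold) are illustrative, not SDK API:
// Illustrative backlog gauge: increment on arrival, decrement after
// processing, warn when the backlog crosses a threshold.
public class QueueDepthMonitor
{
    private int _pending;
    private const int WarnThreshold = 1000;

    public int PendingCount => Volatile.Read(ref _pending);

    public void OnMessageArrived() => Interlocked.Increment(ref _pending);

    public void OnMessageProcessed(ILogger logger)
    {
        var depth = Interlocked.Decrement(ref _pending);
        if (depth > WarnThreshold)
        {
            logger.LogWarning("Backlog at {Depth} messages - possible slow consumer", depth);
        }
    }
}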
Module Lifecycle
The Nexus host manages the complete lifecycle of your module. You just implement the interface methods - the host handles everything else.
Lifecycle Overview
Module States
Your module progresses through the following states during its lifetime:
Lifecycle Methods
Method Overview
The SDK provides these lifecycle methods that you can override in your module:
Method | When Called | Purpose | Required |
---|---|---|---|
OnInitialized() | After module is loaded and dependencies injected | Set up message subscriptions, initialize resources | Yes (abstract) |
OnStarting() | Before module enters Running state | Start background tasks, open connections | No (virtual) |
OnStopping() | When module is shutting down | Clean up resources, close connections | No (virtual) |
GetHealthAsync() | Periodically based on health check config | Report module health status | No (virtual) |
Implementation Examples
public class DataCollectorModule : ModuleBase
{
private Timer? _collectionTimer;
private IDataService? _dataService;
protected override void OnInitialized()
{
Logger.LogInformation("Initializing Data Collector Module");
// Subscribe to control messages
Messages.SubscribeAsync("control.datacollector.*", HandleControlMessage);
// Subscribe to data requests
Messages.RegisterHandlerAsync("data.request", HandleDataRequest);
// Initialize services
_dataService = new DataService(Configuration);
}
protected override void OnStarting()
{
Logger.LogInformation("Starting data collection");
// Start periodic data collection
var interval = Configuration.GetValue("CollectionInterval", 5000);
_collectionTimer = new Timer(CollectData, null, 0, interval);
// Open database connection
_dataService?.Connect();
}
protected override void OnStopping()
{
Logger.LogInformation("Stopping data collection");
// Stop timer
_collectionTimer?.Dispose();
// Close connections
_dataService?.Disconnect();
// Ensure all pending operations complete
_dataService?.FlushPendingData();
}
public override Task<HealthStatus> GetHealthAsync()
{
// Check if service is connected
if (_dataService?.IsConnected != true)
{
return Task.FromResult(HealthStatus.Unhealthy("Database disconnected"));
}
// Check if collection is running
if (_collectionTimer == null)
{
return Task.FromResult(HealthStatus.Unhealthy("Collection not running"));
}
return Task.FromResult(HealthStatus.Healthy("All systems operational"));
}
private async void CollectData(object? state)
{
try
{
var data = await _dataService.CollectDataAsync();
await Messages.SendAsync("data.collected", data);
}
catch (Exception ex)
{
Logger.LogError(ex, "Error collecting data");
}
}
}
from nexus_sdk import Module, HealthStatus
import asyncio
from datetime import datetime
class DataCollectorModule(Module):
def __init__(self):
super().__init__()
self.collection_task = None
self.is_connected = False
self.data_service = None
def on_initialized(self):
"""Called after module is loaded"""
self.logger.info("Initializing Data Collector Module")
# Subscribe to control messages
self.subscribe("control.datacollector.*", self.handle_control_message)
# Register request handler
self.register_handler("data.request", self.handle_data_request)
# Initialize service
from .data_service import DataService
self.data_service = DataService(self.config)
async def on_starting(self):
"""Called before entering running state"""
self.logger.info("Starting data collection")
# Connect to database
await self.data_service.connect()
self.is_connected = True
# Start collection task
self.collection_task = asyncio.create_task(self.collect_data_loop())
async def on_stopping(self):
"""Called when shutting down"""
self.logger.info("Stopping data collection")
# Cancel collection task
if self.collection_task:
self.collection_task.cancel()
try:
await self.collection_task
except asyncio.CancelledError:
pass
# Disconnect from database
if self.data_service:
await self.data_service.disconnect()
self.is_connected = False
async def get_health(self):
"""Report module health"""
if not self.is_connected:
return HealthStatus.unhealthy("Database disconnected")
if not self.collection_task or self.collection_task.done():
return HealthStatus.unhealthy("Collection not running")
return HealthStatus.healthy("All systems operational")
async def collect_data_loop(self):
"""Background data collection"""
interval = self.config.get('collection_interval', 5)
while True:
try:
data = await self.data_service.collect_data()
await self.publish("data.collected", {
"timestamp": datetime.now().isoformat(),
"data": data
})
except Exception as e:
self.logger.error(f"Error collecting data: {e}")
await asyncio.sleep(interval)
class DataCollectorModule : public nexus::ModuleBase {
private:
std::unique_ptr<DataService> data_service_;
std::thread collection_thread_;
std::atomic<bool> running_{false};
std::condition_variable cv_;
std::mutex mutex_;
protected:
void on_initialized() override {
logger()->info("Initializing Data Collector Module");
// Subscribe to control messages
messages()->subscribe("control.datacollector.*",
[this](const nexus::Message& msg) {
handle_control_message(msg);
});
// Register request handler
messages()->register_handler("data.request",
[this](const nexus::Message& msg) {
return handle_data_request(msg);
});
// Initialize service
data_service_ = std::make_unique<DataService>(config());
}
void on_starting() override {
logger()->info("Starting data collection");
// Connect to database
if (!data_service_->connect()) {
throw std::runtime_error("Failed to connect to database");
}
// Start collection thread
running_ = true;
collection_thread_ = std::thread([this]() {
collect_data_loop();
});
}
void on_stopping() override {
logger()->info("Stopping data collection");
// Signal thread to stop
{
std::lock_guard lock(mutex_);
running_ = false;
}
cv_.notify_all();
// Wait for thread to finish
if (collection_thread_.joinable()) {
collection_thread_.join();
}
// Disconnect from database
if (data_service_) {
data_service_->disconnect();
}
}
nexus::Result<nexus::HealthStatus> get_health() override {
// Check database connection
if (!data_service_ || !data_service_->is_connected()) {
return nexus::HealthStatus::unhealthy("Database disconnected");
}
// Check collection thread
if (!running_) {
return nexus::HealthStatus::unhealthy("Collection not running");
}
return nexus::HealthStatus::healthy("All systems operational");
}
private:
void collect_data_loop() {
auto interval = config()->get("collection_interval", 5);
while (running_) {
try {
auto data = data_service_->collect_data();
messages()->send("data.collected", DataMessage{
.timestamp = std::chrono::system_clock::now(),
.data = std::move(data)
});
} catch (const std::exception& e) {
logger()->error("Error collecting data: {}", e.what());
}
// Wait for interval or stop signal
std::unique_lock lock(mutex_);
cv_.wait_for(lock, std::chrono::seconds(interval),
[this] { return !running_; });
}
}
};
Best Practices
Initialization Phase
- Subscribe to messages early: Set up all message subscriptions in OnInitialized
- Validate configuration: Check required configuration values and fail fast
- Initialize resources: Create service instances but don't start them yet
- Log initialization: Provide clear logging for debugging
Starting Phase
- Open connections: Establish database, network, or hardware connections
- Start background tasks: Launch timers, threads, or async tasks
- Verify dependencies: Ensure required services are available
- Handle startup failures: Throw exceptions to prevent module from running if critical resources fail
Running Phase
- Handle errors gracefully: Don't let unhandled exceptions crash your module
- Respect cancellation: Check cancellation tokens in long-running operations
- Monitor resources: Track memory, connections, and other resources
- Maintain health: Keep health status updated
Stopping Phase
- Stop gracefully: Cancel operations and wait for completion
- Clean up resources: Close connections, dispose objects, free memory
- Save state if needed: Persist any important state before shutdown
- Log shutdown: Provide clear logging of shutdown process
Error Handling
Lifecycle Error Behavior
Phase | Exception Behavior | Module State | System Impact |
---|---|---|---|
OnInitialized | Module fails to load | Not created | Critical modules stop system |
OnStarting | Module fails to start | Initialization failed | Critical modules stop system |
Running | Logged, module continues | Running (degraded) | Health checks may fail |
OnStopping | Logged, shutdown continues | Stopped | Resources may leak |
GetHealthAsync | Returns unhealthy status | No change | May trigger restart |
If a module is marked as critical (critical: true in the manifest), a failure during initialization or startup will stop the entire NEXUS-1 system.
State Transitions
Valid State Transitions
- Created → Initializing: Automatic when module is loaded
- Initializing → Starting: After OnInitialized completes successfully
- Starting → Running: After OnStarting completes successfully
- Running → Stopping: When shutdown is requested
- Any State → Failed: When an unrecoverable error occurs
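One way to encode and validate these transitions is a small lookup table, sketched below. The enum values mirror the states above, but the Stopping → Stopped step and the types are assumptions for illustration, not the host's actual implementation:
public enum ModuleState { Created, Initializing, Starting, Running, Stopping, Stopped, Failed }

public static class ModuleStateMachine
{
    // Transitions from the list above; any state may move to Failed.
    private static readonly Dictionary<ModuleState, ModuleState[]> Valid = new()
    {
        [ModuleState.Created]      = new[] { ModuleState.Initializing },
        [ModuleState.Initializing] = new[] { ModuleState.Starting },
        [ModuleState.Starting]     = new[] { ModuleState.Running },
        [ModuleState.Running]      = new[] { ModuleState.Stopping },
        [ModuleState.Stopping]     = new[] { ModuleState.Stopped }
    };

    public static bool CanTransition(ModuleState from, ModuleState to) =>
        to == ModuleState.Failed ||
        (Valid.TryGetValue(from, out var next) && Array.IndexOf(next, to) >= 0);
}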
Health Monitoring
Implementing Health Checks
The GetHealthAsync method is called periodically based on your manifest configuration:
# Manifest configuration
healthCheck:
interval: 30s # Check every 30 seconds
timeout: 5s # Timeout after 5 seconds
failureThreshold: 3 # Unhealthy after 3 failures
// C# Implementation
public override Task<HealthStatus> GetHealthAsync()
{
var checks = new List<string>();
// Check critical resources
if (!IsConnectedToDatabase())
checks.Add("Database disconnected");
if (!IsProcessingData())
checks.Add("Data processing stopped");
if (GetQueueDepth() > 1000)
checks.Add("Queue backlog detected");
// Return appropriate status
if (checks.Any())
return Task.FromResult(HealthStatus.Unhealthy(string.Join("; ", checks)));
return Task.FromResult(HealthStatus.Healthy());
}
Health Check Best Practices
- Keep health checks fast and lightweight
- Check only critical dependencies
- Return specific failure reasons
- Don't modify state during health checks
- Consider implementing detailed health metrics for monitoring
Real-World Examples
Industrial Automation
Smart Factory Controller
Complete implementation of a smart factory system with secure module integration, real-time monitoring, and predictive maintenance.
// C# Smart Factory Module using secure API
[Module("smart-factory", "Smart Factory Controller", "1.0.0")]
public class SmartFactoryModule : ModuleBase
{
protected override void OnInitialized()
{
// Subscribe to sensor data
Messages.SubscribeAsync("sensors.*", HandleSensorData);
// Subscribe to PLC commands
Messages.SubscribeAsync("plc.commands.*", HandlePLCCommand);
Logger.LogInformation("Smart Factory Controller initialized");
}
private async Task HandleSensorData(Message message)
{
var data = message.GetPayload<SensorData>();
// Process sensor data
if (data.Temperature > 80.0)
{
await Messages.SendAsync("alerts.temperature", new
{
Level = "Critical",
Value = data.Temperature,
Message = "Temperature threshold exceeded"
});
}
}
}
# Python Smart Factory Module using secure API
from nexus_sdk import module, Module, Message
from datetime import datetime
import asyncio
@module("smart-factory", "Smart Factory Controller", "1.0.0")
class SmartFactoryModule(Module):
def initialize(self):
# Subscribe to sensor data
self.subscribe("sensors.*", self.handle_sensor_data)
# Subscribe to PLC commands
self.subscribe("plc.commands.*", self.handle_plc_command)
self.logger.info("Smart Factory Controller initialized")
async def handle_sensor_data(self, message: Message):
data = message.payload
# Process sensor data
if data.get('temperature', 0) > 80.0:
await self.publish("alerts.temperature", {
"level": "Critical",
"value": data['temperature'],
"message": "Temperature threshold exceeded",
"timestamp": datetime.now().isoformat()
})
async def handle_plc_command(self, message: Message):
command = message.payload
self.logger.info(f"Received PLC command: {command}")
# Process PLC commands here
Energy Grid Manager
Distributed energy management system with load balancing, renewable integration, and demand response.
// C++ Energy Grid Module using secure API
NEXUS_MODULE("energy-grid", "Energy Grid Manager", "1.0.0")
class EnergyGridModule : public nexus::ModuleBase {
public:
nexus::Result<void> initialize() override {
// Subscribe to grid metrics
NEXUS_TRY(messages().subscribe("grid.metrics.*",
[this](const nexus::Message& msg) {
return handle_grid_metrics(msg);
}));
logger().info("Energy Grid Manager initialized");
return nexus::success();
}
private:
nexus::Result<void> handle_grid_metrics(const nexus::Message& msg) {
auto metrics = msg.payload<GridMetrics>();
// Load balancing logic
if (metrics.load_factor > 0.9) {
return messages().send("grid.control", LoadBalanceCommand{
.action = "redistribute",
.target_load = 0.75
});
}
return nexus::success();
}
};
classdef EnergyGridManager < nexus.Module
% MATLAB Energy Grid Manager using secure API
properties (Constant)
Name = 'energy-grid'
Description = 'Energy Grid Manager'
Version = '1.0.0'
end
properties (Access = private)
gridMetrics = struct()
loadHistory = []
end
methods
function obj = EnergyGridManager()
obj = obj@nexus.Module();
end
function initialize(obj)
% Subscribe to grid metrics
obj.subscribe('grid.metrics.*', @obj.handleGridMetrics);
% Schedule periodic optimization
obj.scheduleInterval(@obj.optimizeGrid, 30);
obj.logger.info('Energy Grid Manager initialized');
end
function handleGridMetrics(obj, message)
metrics = message.payload;
obj.gridMetrics = metrics;
obj.loadHistory = [obj.loadHistory, metrics.load_factor];
% Keep only last 100 samples
if length(obj.loadHistory) > 100
obj.loadHistory = obj.loadHistory(end-99:end);
end
% Load balancing logic
if metrics.load_factor > 0.9
obj.publish('grid.control', struct(...
'action', 'redistribute', ...
'target_load', 0.75, ...
'zones', metrics.affected_zones ...
));
end
end
function optimizeGrid(obj)
if ~isempty(obj.loadHistory)
% Calculate trends
trend = polyfit(1:length(obj.loadHistory), obj.loadHistory, 1);
if trend(1) > 0.01 % Rising trend
obj.publish('grid.prediction', struct(...
'trend', 'increasing', ...
'rate', trend(1), ...
'recommendation', 'prepare_additional_capacity' ...
));
end
end
end
end
end
Data Processing
Real-Time Analytics Pipeline
Stream processing pipeline with anomaly detection, time-series analysis, and secure data handling.
# Python Analytics Module using secure API
from nexus_sdk import module, Module, Message
import numpy as np
from datetime import datetime
@module("analytics-pipeline", "Analytics Pipeline", "1.0.0")
class AnalyticsPipeline(Module):
def __init__(self):
super().__init__()
self.window_size = 100
self.data_buffer = []
def initialize(self):
# Subscribe to raw data
self.subscribe("data.raw.*", self.process_data)
# Schedule periodic analysis
self.schedule_periodic(self.analyze_window, interval=5.0)
async def process_data(self, message: Message):
data = message.payload
self.data_buffer.append(data)
# Keep window size
if len(self.data_buffer) > self.window_size:
self.data_buffer.pop(0)
async def analyze_window(self):
if len(self.data_buffer) >= 10:
# Perform anomaly detection
values = np.array([d['value'] for d in self.data_buffer])
mean = np.mean(values)
std = np.std(values)
# Detect anomalies
anomalies = np.abs(values - mean) > 3 * std
if np.any(anomalies):
await self.publish("analytics.anomaly", {
"timestamp": datetime.now(),
"anomaly_count": int(np.sum(anomalies)),
"values": values[anomalies].tolist()
})
MATLAB Signal Processing
Advanced signal processing module with FFT analysis, filtering, and real-time visualization.
classdef SignalProcessor < nexus.Module
% MATLAB Signal Processing Module using secure API
properties (Constant)
Name = 'signal-processor'
Description = 'Advanced Signal Processing'
Version = '1.0.0'
end
properties (Access = private)
bufferSize = 1024
sampleRate = 44100
signalBuffer = []
end
methods
function obj = SignalProcessor()
obj = obj@nexus.Module();
end
function initialize(obj)
% Subscribe to raw signal data
obj.subscribe('signals.raw.*', @obj.processSignal);
% Schedule periodic FFT analysis
obj.scheduleInterval(@obj.analyzeSpectrum, 0.1);
obj.logger.info('Signal processor initialized');
end
function processSignal(obj, message)
data = message.payload;
% Add to circular buffer
obj.signalBuffer = [obj.signalBuffer, data.samples];
if length(obj.signalBuffer) > obj.bufferSize
obj.signalBuffer = obj.signalBuffer(end-obj.bufferSize+1:end);
end
end
function analyzeSpectrum(obj)
if length(obj.signalBuffer) >= obj.bufferSize
% Perform FFT
Y = fft(obj.signalBuffer .* hamming(length(obj.signalBuffer))');
P = abs(Y/obj.bufferSize).^2;
f = obj.sampleRate*(0:(obj.bufferSize/2))/obj.bufferSize;
% Find dominant frequencies
[peaks, locs] = findpeaks(P(1:obj.bufferSize/2+1), 'MinPeakHeight', 0.1);
% Publish results
obj.publish('signals.spectrum', struct(...
'frequencies', f(locs), ...
'magnitudes', peaks, ...
'timestamp', datetime('now') ...
));
end
end
end
end
LabVIEW Data Acquisition
High-speed data acquisition system with hardware integration and real-time control.
// LabVIEW Data Acquisition Module
// File: DataAcquisition.vi
// Module Configuration:
modules:
- name: daq-system
type: labview
path: ./modules/DataAcquisition.vi
runtime: labview-2023
capabilities:
- messages.publish
- messages.subscribe
- hardware.access
config:
sampling_rate: 100000
channels: ["ai0", "ai1", "ai2", "ai3"]
trigger_mode: "continuous"
// VI Implementation Structure:
// === Initialize Section ===
// 1. Call Nexus.Initialize.vi
// 2. Configure DAQ hardware
// - Create DAQmx Task
// - Configure analog input channels
// - Set sampling rate and timing
// 3. Register message handlers
// - Subscribe to "daq.control.*" for commands
// - Subscribe to "daq.config.*" for runtime config
// === Main Loop ===
// Event Structure with cases:
//
// Case: "DAQ Data Available"
// - Read samples from DAQ
// - Apply calibration factors
// - Package data with timestamps
// - Call Nexus.Publish.vi
// Topic: "daq.data.[channel]"
// Payload: {samples[], timestamp, channel_id}
//
// Case: "Control Message Received"
// - Parse command from Nexus.Message
// - Switch on command type:
// * "start": Begin acquisition
// * "stop": Pause acquisition
// * "configure": Update DAQ settings
// * "calibrate": Run calibration routine
//
// Case: "Error"
// - Log error via Nexus.Logger.vi
// - Publish alert: "daq.error"
// - Attempt recovery or safe shutdown
// === Cleanup Section ===
// 1. Stop DAQ Task
// 2. Clear hardware resources
// 3. Unsubscribe from all topics
// 4. Call Nexus.Shutdown.vi
// Example Usage from Other Modules:
// await messages.send("daq.control", {
// command: "configure",
// channels: ["ai0", "ai1"],
// rate: 50000
// });
Performance Optimizations
Optimize your NEXUS-1 modules for maximum performance, efficiency, and scalability. This section covers techniques for message bus optimization, resource management, and implementation strategies.
General Performance Guidelines
Key Performance Principles
- Minimize Message Size: Keep payloads small and use efficient serialization
- Batch Operations: Group multiple operations when possible
- Async Everything: Use asynchronous patterns for all I/O operations
- Cache Strategically: Cache frequently accessed data locally
- Profile and Measure: Always measure before and after optimization
Performance Metrics
Metric | Target | Description |
---|---|---|
Message Latency | < 1ms | Time from publish to receive |
Health Check Response | < 100ms | Time to respond to health check |
Startup Time | < 5s | Time from Created to Running |
Memory Overhead | < 50MB | Base memory usage per module |
CPU Usage (Idle) | < 1% | CPU usage when not processing |
Message Bus Optimization
Efficient Message Patterns
// ❌ Non-Optimized: Individual messages
public async Task SendDataPoints(List<DataPoint> points)
{
foreach (var point in points)
{
await _context.PublishAsync("sensor.data", point);
}
}
// ✅ Optimized: Batched messages
public async Task SendDataPoints(List<DataPoint> points)
{
// Batch into chunks of 100
const int batchSize = 100;
for (int i = 0; i < points.Count; i += batchSize)
{
var batch = points.Skip(i).Take(batchSize);
await _context.PublishAsync("sensor.data.batch", new
{
Count = batch.Count(),
Points = batch,
Timestamp = DateTime.UtcNow
});
}
}
// ✅ Optimized: Compression for large payloads
public async Task SendLargeData(byte[] data)
{
if (data.Length > 1024) // 1KB threshold
{
using var compressed = new MemoryStream();
using (var gzip = new GZipStream(compressed, CompressionMode.Compress))
{
await gzip.WriteAsync(data, 0, data.Length);
}
await _context.PublishAsync("data.compressed", new
{
Original = data.Length,
Compressed = compressed.Length,
Data = compressed.ToArray()
});
}
else
{
await _context.PublishAsync("data.raw", data);
}
}
# ❌ Non-Optimized: Individual messages
async def send_data_points(self, points):
for point in points:
await self.context.publish("sensor.data", point)
# ✅ Optimized: Batched messages
async def send_data_points(self, points):
batch_size = 100
for i in range(0, len(points), batch_size):
batch = points[i:i + batch_size]
await self.context.publish("sensor.data.batch", {
"count": len(batch),
"points": batch,
"timestamp": datetime.utcnow().isoformat()
})
# ✅ Optimized: Compression for large payloads
import gzip
import json
async def send_large_data(self, data):
if len(data) > 1024: # 1KB threshold
if isinstance(data, dict):
data = json.dumps(data).encode()
compressed = gzip.compress(data)
await self.context.publish("data.compressed", {
"original": len(data),
"compressed": len(compressed),
"data": compressed.hex() # Convert bytes to hex string
})
else:
await self.context.publish("data.raw", data)
# ✅ Optimized: Message pooling
class MessagePool:
def __init__(self, size=100):
self._pool = [{"data": None} for _ in range(size)]
self._index = 0
def get_message(self):
msg = self._pool[self._index]
self._index = (self._index + 1) % len(self._pool)
return msg
    def send_with_pool(self, data):
        # Reuse a pooled dict to avoid per-send allocations; the dict is
        # recycled, so consumers must not keep references to it
        msg = self.get_message()
        msg["data"] = data
        return msg
// ❌ Non-Optimized: Individual messages
void send_data_points(const std::vector<DataPoint>& points) {
for (const auto& point : points) {
context->publish("sensor.data", point);
}
}
// ✅ Optimized: Batched messages with move semantics
void send_data_points(std::vector<DataPoint>&& points) {
constexpr size_t batch_size = 100;
for (size_t i = 0; i < points.size(); i += batch_size) {
std::vector<DataPoint> batch;
batch.reserve(std::min(batch_size, points.size() - i));
auto end = std::min(i + batch_size, points.size());
std::move(points.begin() + i, points.begin() + end,
std::back_inserter(batch));
json msg = {
{"count", batch.size()},
{"points", std::move(batch)},
{"timestamp", std::chrono::system_clock::now()}
};
context->publish("sensor.data.batch", std::move(msg));
}
}
// ✅ Optimized: Zero-copy message passing
class ZeroCopyBuffer {
private:
std::shared_ptr<uint8_t[]> data_;
size_t size_;
public:
ZeroCopyBuffer(size_t size)
: data_(std::make_shared<uint8_t[]>(size)), size_(size) {}
    void publish_zero_copy(IModuleContext* context) {
        // Share the buffer's ownership instead of copying the payload
        context->publish("data.zerocopy", data_);
    }
};
// ✅ Optimized: Lock-free message queue (multi-producer enqueue sketch)
template<typename T>
class LockFreeQueue {
    struct Node {
        std::atomic<T*> data;
        std::atomic<Node*> next;
    };
    // Node must be declared before these members can reference it
    std::atomic<Node*> head_;
    std::atomic<Node*> tail_;
public:
    LockFreeQueue() {
        // Seed with a dummy node so head_ and tail_ are never null
        Node* dummy = new Node{nullptr, nullptr};
        head_.store(dummy);
        tail_.store(dummy);
    }
    void enqueue(T item) {
        Node* new_node = new Node{new T(std::move(item)), nullptr};
        Node* prev_tail = tail_.exchange(new_node);
        prev_tail->next.store(new_node);
    }
};
classdef VibrationAnalysisModule < NexusModule
properties (Access = private)
messagePool
poolIndex = 1
end
methods
        function obj = VibrationAnalysisModule()
% Pre-allocate message pool
obj.messagePool = cell(100, 1);
for i = 1:100
obj.messagePool{i} = struct('data', []);
end
end
% ❌ Non-Optimized: Individual messages
function sendDataPoints_slow(obj, points)
for i = 1:length(points)
obj.publish('sensor.data', points(i));
end
end
% ✅ Optimized: Batched messages
function sendDataPoints(obj, points)
batchSize = 100;
numPoints = length(points);
for i = 1:batchSize:numPoints
endIdx = min(i + batchSize - 1, numPoints);
batch = points(i:endIdx);
msg = struct(...
'count', length(batch), ...
'points', batch, ...
'timestamp', datetime('now', 'TimeZone', 'UTC'));
obj.publish('sensor.data.batch', msg);
end
end
% ✅ Optimized: Reuse message structures
function msg = getPooledMessage(obj)
msg = obj.messagePool{obj.poolIndex};
obj.poolIndex = mod(obj.poolIndex, 100) + 1;
end
% ✅ Optimized: Vectorized operations
        function processDataOptimized(obj, data, threshold)
% Use vectorized operations instead of loops
filtered = data(data > threshold); % Vectorized filter
normalized = (filtered - mean(filtered)) / std(filtered);
% Send processed data in one message
obj.publish('processed.data', normalized);
end
% ✅ Optimized: Preallocate arrays
function result = processStream(obj, streamSize)
% Preallocate for known size
result = zeros(streamSize, 1);
for i = 1:streamSize
result(i) = obj.processValue(i);
end
end
end
end
// Performance Optimization Patterns in LabVIEW
//
// ❌ Non-Optimized: Individual Message Sending
// 1. For Loop iterating through array
// 2. Publish VI inside loop
// 3. High overhead per message
//
// ✅ Optimized: Batch Message Sending
// 1. Use Array Subset to create batches
// 2. Bundle batch with metadata
// 3. Single Publish call per batch
//
// Implementation:
// - Wire array to Array Size
// - Use Quotient & Remainder for batch calculation
// - For Loop with batch count iterations
// - Array Subset for batch extraction
// - Bundle: count, points, timestamp
// - Publish to "sensor.data.batch"
//
// ✅ Optimized: Message Queue Pattern
// 1. Create Queue Reference (outside loop)
// 2. Enqueue messages in producer loop
// 3. Dequeue and batch in consumer loop
// 4. Destroy Queue when done
//
// ✅ Optimized: Preallocate Arrays
// 1. Initialize Array with expected size
// 2. Replace Array Subset instead of Build Array
// 3. Avoid memory reallocation
//
// ✅ Optimized: Use Shift Registers
// 1. Store state in shift registers
// 2. Avoid repeated memory allocation
// 3. Maintain running calculations
//
// Performance Tips:
// - Avoid Build Array in loops
// - Use In Place Element structures
// - Enable parallelism where possible
// - Profile with Desktop Execution Trace
Message Serialization Optimization
- Binary vs JSON: Use binary formats for high-frequency data (see the sketch after this list)
- Schema Evolution: Design messages for backward compatibility
- Selective Fields: Only include necessary data in messages
- Compression: Compress large payloads (>1KB) before sending
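To make the binary-vs-JSON trade-off concrete, the sketch below encodes the same reading both ways; the fixed 20-byte layout is an illustrative choice, not a NEXUS-1 wire format:
using System.Buffers.Binary;
using System.Text.Json;

public readonly record struct Reading(long TimestampTicks, double Value, int SensorId);

public static class ReadingCodec
{
    // JSON: self-describing, larger, slower to parse
    public static byte[] ToJson(Reading r) => JsonSerializer.SerializeToUtf8Bytes(r);

    // Binary: fixed 20-byte layout (8 + 8 + 4), no field names on the wire
    public static byte[] ToBinary(Reading r)
    {
        var buf = new byte[20];
        BinaryPrimitives.WriteInt64LittleEndian(buf.AsSpan(0, 8), r.TimestampTicks);
        BinaryPrimitives.WriteDoubleLittleEndian(buf.AsSpan(8, 8), r.Value);
        BinaryPrimitives.WriteInt32LittleEndian(buf.AsSpan(16, 4), r.SensorId);
        return buf;
    }

    public static Reading FromBinary(ReadOnlySpan<byte> buf) => new(
        BinaryPrimitives.ReadInt64LittleEndian(buf.Slice(0, 8)),
        BinaryPrimitives.ReadDoubleLittleEndian(buf.Slice(8, 8)),
        BinaryPrimitives.ReadInt32LittleEndian(buf.Slice(16, 4)));
}
For high-frequency topics, the fixed layout is roughly a third the size of its JSON equivalent here, and decoding avoids string parsing entirely.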
Resource Management
Memory Optimization
public class HighFrequencyDataLogger : ModuleBase
{
// ✅ Object pooling for frequent allocations
private readonly ObjectPool<SensorReading> _readingPool;
private readonly Channel<SensorReading> _dataChannel;
public HighFrequencyDataLogger()
{
// Initialize object pool for sensor readings
_readingPool = new DefaultObjectPool<SensorReading>(
new DefaultPooledObjectPolicy<SensorReading>(), 100);
// Bounded channel to prevent memory growth
_dataChannel = Channel.CreateBounded<SensorReading>(
new BoundedChannelOptions(1000)
{
FullMode = BoundedChannelFullMode.Wait,
SingleReader = true,
SingleWriter = false
});
}
// ✅ Span-based processing to avoid allocations
public void ProcessSensorData(ReadOnlySpan<byte> rawData)
{
const int readingSize = 16; // 8 bytes timestamp + 8 bytes value
for (int i = 0; i < rawData.Length; i += readingSize)
{
var chunk = rawData.Slice(i, Math.Min(readingSize, rawData.Length - i));
var reading = ParseSensorReading(chunk);
LogReading(reading);
}
}
// ✅ ArrayPool for temporary buffers
public async Task ProcessSensorStreamAsync(Stream sensorStream)
{
var buffer = ArrayPool<byte>.Shared.Rent(4096);
try
{
int bytesRead;
while ((bytesRead = await sensorStream.ReadAsync(buffer, 0, buffer.Length)) > 0)
{
ProcessSensorData(buffer.AsSpan(0, bytesRead));
}
}
finally
{
ArrayPool<byte>.Shared.Return(buffer);
}
}
// ✅ Dispose pattern for resource cleanup
protected override void Dispose(bool disposing)
{
if (disposing)
{
_dataChannel?.Writer.TryComplete();
// Return all pooled objects
}
base.Dispose(disposing);
}
}
import gc
import array
import mmap
from collections import deque
from contextlib import contextmanager
class HighSpeedVisionModule(NexusModule):
def __init__(self):
super().__init__()
# ✅ Use deque for bounded frame buffer
self.frame_buffer = deque(maxlen=30) # 1 second at 30fps
# ✅ Preallocate arrays for image processing
self.pixel_buffer = array.array('B', [0] * (1920 * 1080 * 3)) # RGB
# ✅ Memory-mapped files for video recording
self.video_mmap = None
# ✅ Generator for memory-efficient iteration
def process_large_dataset(self, filename):
with open(filename, 'r') as f:
for line in f: # One line at a time
yield self.process_line(line)
# ✅ Context manager for resource management
@contextmanager
def large_buffer(self, size):
buffer = bytearray(size)
try:
yield buffer
finally:
del buffer
gc.collect() # Force garbage collection
# ✅ Memory-mapped file for large data
def process_with_mmap(self, filename):
with open(filename, 'r+b') as f:
with mmap.mmap(f.fileno(), 0) as mmapped:
# Process data without loading into memory
for i in range(0, len(mmapped), 4096):
chunk = mmapped[i:i+4096]
self.process_chunk(chunk)
# ✅ Slots for reduced memory overhead
class DataPoint:
__slots__ = ['timestamp', 'value', 'sensor_id']
def __init__(self, timestamp, value, sensor_id):
self.timestamp = timestamp
self.value = value
self.sensor_id = sensor_id
    # ✅ Weak references for caches (fragment: create the cache in __init__)
    import weakref
    def _init_cache(self):
        self._cache = weakref.WeakValueDictionary()
    def cleanup(self):
        # ✅ Explicit cleanup of the buffers created in __init__
        self.frame_buffer.clear()
        if self.video_mmap:
            self.video_mmap.close()
        gc.collect()
class RobotArmController : public NexusModule {
private:
// ✅ Custom allocator for memory pooling
template<typename T>
class PoolAllocator {
std::vector<T> pool_;
std::queue<T*> available_;
public:
PoolAllocator(size_t size) {
pool_.reserve(size);
for (size_t i = 0; i < size; ++i) {
pool_.emplace_back();
available_.push(&pool_[i]);
}
}
T* allocate() {
if (available_.empty()) return nullptr;
T* obj = available_.front();
available_.pop();
return obj;
}
void deallocate(T* obj) {
available_.push(obj);
}
};
// ✅ Ring buffer for zero-allocation streaming
template<typename T, size_t Size>
class RingBuffer {
std::array<T, Size> buffer_;
std::atomic<size_t> write_idx_{0};
std::atomic<size_t> read_idx_{0};
public:
bool try_push(T&& item) {
size_t write = write_idx_.load();
size_t next = (write + 1) % Size;
if (next == read_idx_.load()) return false;
buffer_[write] = std::move(item);
write_idx_.store(next);
return true;
}
};
    // ✅ Memory-mapped file handling (POSIX; needs <fcntl.h>, <sys/mman.h>, <sys/stat.h>, <unistd.h>)
    class MappedFile {
void* data_ = nullptr;
size_t size_ = 0;
int fd_ = -1;
public:
MappedFile(const std::string& filename) {
fd_ = open(filename.c_str(), O_RDWR);
struct stat sb;
fstat(fd_, &sb);
size_ = sb.st_size;
data_ = mmap(nullptr, size_, PROT_READ | PROT_WRITE,
MAP_SHARED, fd_, 0);
}
~MappedFile() {
if (data_) munmap(data_, size_);
if (fd_ >= 0) close(fd_);
}
        std::span<uint8_t> data() { // std::span is C++20; use gsl::span on a C++17 toolchain
return {static_cast<uint8_t*>(data_), size_};
}
};
public:
// ✅ Stack allocation for small objects
void process_data() {
char buffer[4096]; // Stack allocated
std::array<double, 100> values{}; // Stack allocated
// Process without heap allocation
process_buffer(std::span{buffer});
}
// ✅ Custom deleter for smart pointers
struct BufferDeleter {
PoolAllocator<uint8_t>* pool;
void operator()(uint8_t* ptr) {
// Return to pool instead of delete
if (pool) pool->deallocate(ptr);
}
};
using PooledBuffer = std::unique_ptr<uint8_t[], BufferDeleter>;
};
classdef SpectralAnalyzer < NexusModule
properties (Access = private)
% ✅ Preallocate data structures
dataBuffer
bufferSize = 10000
bufferIndex = 1
% ✅ Reuse temporary variables
tempArray
tempMatrix
end
methods
        function obj = SpectralAnalyzer()
% ✅ Preallocate all arrays
obj.dataBuffer = zeros(obj.bufferSize, 1);
obj.tempArray = zeros(1000, 1);
obj.tempMatrix = zeros(100, 100);
end
% ✅ Efficient memory usage patterns
function processLargeData(obj, dataFile)
% Use matfile for partial loading
m = matfile(dataFile);
            info = whos(m, 'data'); % size of the stored variable, without loading it
% Process in chunks
chunkSize = 1000;
for i = 1:chunkSize:info.size(1)
endIdx = min(i + chunkSize - 1, info.size(1));
chunk = m.data(i:endIdx, :);
obj.processChunk(chunk);
end
end
% ✅ In-place operations
function data = normalizeInPlace(obj, data)
% Avoid creating copies
data = data - mean(data); % In-place subtraction
data = data ./ std(data); % In-place division
end
% ✅ Clear large variables explicitly
function cleanup(obj)
% Clear large arrays
obj.dataBuffer = [];
obj.tempMatrix = [];
            % MATLAB reclaims the memory automatically; there is no explicit GC call
end
% ✅ Use tall arrays for big data
function processBigData(obj, datastore)
% Create tall array
tt = tall(datastore);
% Operations are deferred
processed = obj.transform(tt);
% Compute in chunks
result = gather(processed);
end
% ✅ Memory-efficient data structures
function storeEfficiently(obj, data)
% Use appropriate data types
if all(data == floor(data)) && all(data >= 0) && all(data <= 255)
% Use uint8 for small integers
obj.dataBuffer = uint8(data);
elseif all(data >= -32768) && all(data <= 32767)
% Use int16 for medium integers
obj.dataBuffer = int16(data);
else
% Use single precision if possible
obj.dataBuffer = single(data);
end
end
end
end
// Memory Optimization in LabVIEW
//
// ✅ Preallocate Arrays
// 1. Initialize Array with final size
// 2. Use Replace Array Subset (not Build Array)
// 3. Avoid growing arrays in loops
//
// Implementation:
// - Initialize Array (size input wired)
// - For Loop with Replace Array Subset
// - Wire initialized array to shift register
//
// ✅ Use In Place Element Structure
// 1. Right-click on array/cluster
// 2. Select "In Place Element Structure"
// 3. Modify data without copies
//
// ✅ Data Value References (DVR)
// 1. Create DVR for large data structures
// 2. Use In Place Element with DVR
// 3. Prevents data copies across VIs
//
// ✅ Queue-based Data Passing
// 1. Obtain Queue Reference
// 2. Enqueue data by reference
// 3. Dequeue in consumer
// 4. Release Queue when done
//
// ✅ Limit Array Sizes
// Properties to set:
// - Array indicator: "Number of Visible Elements"
// - Graph: "History Length" property
// - Chart: "Chart History Length"
//
// ✅ Memory-Mapped Files
// 1. Use File I/O > Advanced > Memory Map
// 2. Map only needed portions
// 3. Unmap when done
//
// ✅ Request Deallocation
// 1. Use "Request Deallocation" function
// 2. Place after large data operations
// 3. Helps LabVIEW memory manager
//
// Best Practices:
// - Monitor with Profile > Performance and Memory
// - Use fixed-size data types when possible
// - Clear large arrays with empty constant
// - Avoid unnecessary data copies
CPU Optimization
- Parallel Processing: Utilize multiple cores for independent operations (see the sketch after this list)
- Avoid Blocking: Use async/await patterns consistently
- Batch Processing: Process multiple items in single operations
- Cache Line Optimization: Structure data for CPU cache efficiency
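As a sketch of the first two points combined, a CPU-bound stage can fan out with Parallel.ForEach while the handler itself stays async (ComputeRms and the topic name are illustrative):
using System.Collections.Concurrent;
using System.Linq;

public async Task ProcessBlocksAsync(IReadOnlyList<double[]> blocks)
{
    var results = new ConcurrentBag<double>();

    // CPU-bound stage: fan out across cores, capped at the core count
    await Task.Run(() => Parallel.ForEach(
        blocks,
        new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount },
        block => results.Add(ComputeRms(block))));

    // I/O-bound stage: publish one batched result asynchronously
    await _context.PublishAsync("analysis.rms.batch", results.ToArray());
}

private static double ComputeRms(double[] block) =>
    Math.Sqrt(block.Sum(v => v * v) / block.Length);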
Configuration Tuning
Performance Configuration Options
# Module manifest performance settings
modules:
- name: "HighPerformanceModule"
type: "process"
executable: "modules/high-perf/module.dll"
# Performance-related configuration
configuration:
# Message bus settings
message_bus:
batch_size: 100 # Batch messages before sending
compression: true # Enable compression for large messages
compression_threshold: 1024 # Bytes
# Threading configuration
threading:
worker_threads: 4 # Number of worker threads
thread_priority: "high" # Thread priority level
cpu_affinity: [0, 1] # Pin to specific CPU cores
# Memory settings
memory:
initial_heap: "100MB" # Initial memory allocation
max_heap: "500MB" # Maximum memory limit
gc_interval: 60 # Garbage collection interval (seconds)
# Caching configuration
cache:
enabled: true
size: "50MB"
ttl: 300 # Time-to-live in seconds
eviction: "lru" # Least recently used
# I/O settings
io:
buffer_size: 65536 # I/O buffer size
async: true # Use async I/O
direct: false # Use direct I/O (bypass cache)
# Resource limits
resources:
cpu_limit: 2.0 # CPU cores limit
memory_limit: "1GB" # Memory limit
# Health check tuning
health_check:
interval: 30 # Seconds between checks
timeout: 5 # Health check timeout
failure_threshold: 3 # Failures before unhealthy
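How a module consumes these settings depends on the SDK's configuration accessor; assuming a Configuration.GetValue(key, default) helper on the module base class and ":"-separated key paths (both assumptions here), the tuning knobs above might be read like this:
protected override Task OnInitializeAsync()
{
    // Key paths mirror the manifest above; defaults match its values.
    // The ":" separator and GetValue helper are assumptions, not confirmed SDK API.
    _batchSize     = Configuration.GetValue("message_bus:batch_size", 100);
    _compress      = Configuration.GetValue("message_bus:compression", true);
    _compressFloor = Configuration.GetValue("message_bus:compression_threshold", 1024);
    _cacheTtl      = TimeSpan.FromSeconds(Configuration.GetValue("cache:ttl", 300));
    return Task.CompletedTask;
}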
Runtime Performance Monitoring
public class ConveyorBeltController : ModuleBase
{
    private readonly IMetrics _metrics; // injected by the host at construction
protected override void OnInitialized()
{
// Register performance counters for conveyor system
_metrics.CreateCounter("items_processed_total");
_metrics.CreateHistogram("item_processing_time_ms");
_metrics.CreateGauge("belt_speed_mps");
// Start monitoring
_ = Task.Run(MonitorPerformanceAsync);
}
// Performance measurement wrapper
public async Task<T> MeasureAsync<T>(string operation, Func<Task<T>> action)
{
using var timer = _metrics.StartTimer($"{operation}_duration");
try
{
return await action();
}
finally
{
_metrics.RecordHistogram("operation_duration_ms",
timer.ElapsedMilliseconds,
new[] { ("operation", operation) });
}
}
private async Task MonitorPerformanceAsync()
{
while (!_cancellationToken.IsCancellationRequested)
{
// Record memory usage
var process = Process.GetCurrentProcess();
_metrics.SetGauge("memory_usage_mb",
process.WorkingSet64 / (1024 * 1024));
// Record CPU usage
_metrics.SetGauge("cpu_usage_percent",
await GetCpuUsageAsync());
await Task.Delay(TimeSpan.FromSeconds(10));
}
}
}
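MeasureAsync can wrap any awaitable operation, for example (ReadBeltSpeedAsync is an illustrative helper):
var speed = await MeasureAsync("read_belt_speed", () => ReadBeltSpeedAsync());
_metrics.SetGauge("belt_speed_mps", speed);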
import time
import psutil
import asyncio
from contextlib import contextmanager
from functools import wraps
class ChemicalReactorMonitor(NexusModule):
def __init__(self):
super().__init__()
self.metrics = {}
self.process = psutil.Process()
# Performance measurement decorator
def measure_performance(func):
@wraps(func)
async def wrapper(self, *args, **kwargs):
start_time = time.perf_counter()
start_memory = self.process.memory_info().rss
try:
result = await func(self, *args, **kwargs)
return result
finally:
duration = time.perf_counter() - start_time
memory_delta = self.process.memory_info().rss - start_memory
self.record_metric(f"{func.__name__}_duration_ms", duration * 1000)
self.record_metric(f"{func.__name__}_memory_delta_bytes", memory_delta)
return wrapper
# Context manager for performance tracking
@contextmanager
def track_performance(self, operation_name):
start = time.perf_counter()
start_cpu = self.process.cpu_percent()
yield
duration = time.perf_counter() - start
cpu_usage = self.process.cpu_percent() - start_cpu
self.metrics[operation_name] = {
'duration_ms': duration * 1000,
'cpu_percent': cpu_usage
}
async def monitor_performance(self):
while self.running:
# System metrics
self.record_metric('memory_usage_mb',
self.process.memory_info().rss / 1024 / 1024)
self.record_metric('cpu_percent',
self.process.cpu_percent(interval=1))
self.record_metric('num_threads',
self.process.num_threads())
# Publish metrics
await self.context.publish('metrics.performance', self.metrics)
await asyncio.sleep(10)
Common Performance Pitfalls
⚠️ Avoid These Common Issues
- Synchronous I/O in Message Handlers: Always use async operations
- Unbounded Queues: Set limits to prevent memory exhaustion
- Large Message Payloads: Keep messages under 64KB when possible
- Frequent Small Messages: Batch when throughput is more important than latency
- Memory Leaks: Properly dispose resources and unsubscribe from events (see the sketch after this list)
- Blocking Health Checks: Keep health checks fast and non-blocking
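The memory-leak item benefits from a concrete shape: record every subscription so shutdown can release them symmetrically. A sketch, assuming an OnShutdownAsync lifecycle hook and an UnsubscribeAsync mirror of SubscribeAsync (both assumptions):
public class LeakSafeModule : ModuleBase
{
    private readonly List<(string Pattern, Func<Message, Task> Handler)> _subscriptions = new();

    private async Task SubscribeTrackedAsync(string pattern, Func<Message, Task> handler)
    {
        await _context.SubscribeAsync(pattern, handler);
        _subscriptions.Add((pattern, handler)); // remember for symmetric cleanup
    }

    protected override async Task OnShutdownAsync() // hypothetical lifecycle hook
    {
        // Release every subscription before the module unloads
        foreach (var (pattern, handler) in _subscriptions)
            await _context.UnsubscribeAsync(pattern, handler); // hypothetical mirror of SubscribeAsync
        _subscriptions.Clear();
    }
}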
Performance Troubleshooting Checklist
- ✓ Enable performance metrics in module manifest
- ✓ Monitor CPU and memory usage trends
- ✓ Check message bus latency metrics
- ✓ Profile hotspots in your code
- ✓ Review message sizes and frequencies
- ✓ Verify resource disposal in lifecycle methods
- ✓ Test under expected load conditions
- ✓ Implement circuit breakers for external calls (see the sketch below)
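For the final checklist item, a circuit breaker can be as small as a counter and a timestamp. A minimal single-threaded sketch (thresholds and naming are illustrative; guard with a lock if handlers run concurrently):
public class CircuitBreaker
{
    private readonly int _failureThreshold;
    private readonly TimeSpan _cooldown;
    private int _failures;
    private DateTime _openedAtUtc;

    public CircuitBreaker(int failureThreshold = 3, TimeSpan? cooldown = null)
    {
        _failureThreshold = failureThreshold;
        _cooldown = cooldown ?? TimeSpan.FromSeconds(30);
    }

    public async Task<T> ExecuteAsync<T>(Func<Task<T>> action)
    {
        // Open: fail fast until the cooldown elapses
        if (_failures >= _failureThreshold &&
            DateTime.UtcNow - _openedAtUtc < _cooldown)
            throw new InvalidOperationException("Circuit open; skipping external call");

        try
        {
            var result = await action();
            _failures = 0;                       // success closes the circuit
            return result;
        }
        catch
        {
            if (++_failures >= _failureThreshold)
                _openedAtUtc = DateTime.UtcNow;  // trip the breaker
            throw;
        }
    }
}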
Testing NEXUS-1 Modules
Thorough testing ensures your modules are reliable and performant, and that they integrate seamlessly with the NEXUS-1 system. This section covers unit testing, integration testing, and testing best practices.
Testing Overview
Testing Strategy
- Unit Tests: Test module logic in isolation
- Integration Tests: Test module interaction with NEXUS-1
- Message Bus Tests: Verify communication patterns
- Lifecycle Tests: Ensure proper state transitions
- Performance Tests: Validate performance requirements
- Error Scenario Tests: Test failure handling
Testing Tools by Language
Language | Unit Testing | Mocking | Integration |
---|---|---|---|
C# | xUnit, NUnit, MSTest | Moq, NSubstitute | TestServer, Docker |
Python | pytest, unittest | unittest.mock, pytest-mock | pytest-asyncio, Docker |
C++ | Google Test, Catch2 | Google Mock, FakeIt | CTest, Docker |
MATLAB | MATLAB Unit Test | Mock Framework | System Test |
LabVIEW | VI Tester, UTF | Mock VIs | TestStand |
Unit Testing
Testing Module Logic
using Xunit;
using Moq;
using System.Threading.Tasks;
public class TemperatureModuleTests
{
private readonly Mock<IModuleContext> _contextMock;
private readonly TemperatureModule _module;
public TemperatureModuleTests()
{
_contextMock = new Mock<IModuleContext>();
_module = new TemperatureModule();
_module.Initialize(_contextMock.Object);
}
[Fact]
public async Task ProcessTemperature_ShouldPublishAlert_WhenThresholdExceeded()
{
// Arrange
var temperature = 85.5;
var threshold = 80.0;
_module.SetThreshold(threshold);
// Act
await _module.ProcessTemperature(temperature);
// Assert
_contextMock.Verify(x => x.PublishAsync(
It.Is<string>(topic => topic == "temperature.alert"),
It.Is<object>(msg =>
((dynamic)msg).Temperature == temperature &&
((dynamic)msg).Threshold == threshold
)
), Times.Once);
}
[Theory]
[InlineData(25.0, HealthStatus.Healthy)]
[InlineData(85.0, HealthStatus.Degraded)]
[InlineData(95.0, HealthStatus.Unhealthy)]
public async Task GetHealthAsync_ShouldReturnCorrectStatus(
double temperature, HealthStatus expected)
{
// Arrange
_module.SetCurrentTemperature(temperature);
// Act
var health = await _module.GetHealthAsync();
// Assert
Assert.Equal(expected, health.Status);
}
[Fact]
public void OnInitialized_ShouldSubscribeToCommands()
{
// Act
_module.OnInitialized();
// Assert
_contextMock.Verify(x => x.SubscribeAsync(
"temperature.commands.*",
It.IsAny<Func<Message, Task>>()
), Times.Once);
}
}
// Mock implementation for testing
public class MockMessageBus : IMessageBus
{
private readonly Dictionary<string, List<Func<Message, Task>>> _handlers = new();
public List<(string Topic, object Payload)> PublishedMessages { get; } = new();
public Task PublishAsync(string topic, object payload)
{
PublishedMessages.Add((topic, payload));
// Simulate message delivery
foreach (var kvp in _handlers)
{
if (MatchesTopic(kvp.Key, topic))
{
var message = new Message { Topic = topic, Payload = payload };
foreach (var handler in kvp.Value)
{
_ = Task.Run(() => handler(message));
}
}
}
return Task.CompletedTask;
}
public Task SubscribeAsync(string pattern, Func<Message, Task> handler)
{
if (!_handlers.ContainsKey(pattern))
_handlers[pattern] = new List<Func<Message, Task>>();
_handlers[pattern].Add(handler);
return Task.CompletedTask;
}
    private bool MatchesTopic(string pattern, string topic)
    {
        // Simple wildcard matching: escape dots, then handle "**" before "*"
        var regex = "^" + pattern
            .Replace(".", "\\.")
            .Replace("**", "\u0001")
            .Replace("*", "[^.]+")
            .Replace("\u0001", ".*") + "$";
        return System.Text.RegularExpressions.Regex.IsMatch(topic, regex);
    }
}
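A brief usage sketch for the mock bus above (xUnit; the short delay accounts for handlers being dispatched via Task.Run):
[Fact]
public async Task MockBus_DeliversOnlyMatchingTopics()
{
    var bus = new MockMessageBus();
    var hits = 0;
    await bus.SubscribeAsync("temperature.*", _ =>
    {
        Interlocked.Increment(ref hits);
        return Task.CompletedTask;
    });

    await bus.PublishAsync("temperature.reading", new { value = 21.5 });
    await bus.PublishAsync("humidity.reading", new { value = 40 });

    await Task.Delay(50); // handlers run on the thread pool in this mock
    Assert.Equal(1, hits);
    Assert.Equal(2, bus.PublishedMessages.Count);
}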
import pytest
import asyncio
from unittest.mock import Mock, AsyncMock, patch
from datetime import datetime
class TestTemperatureModule:
@pytest.fixture
def mock_context(self):
context = Mock()
context.publish = AsyncMock()
context.subscribe = AsyncMock()
context.request = AsyncMock()
return context
@pytest.fixture
def temperature_module(self, mock_context):
from temperature_module import TemperatureModule
module = TemperatureModule()
module.initialize(mock_context)
return module
@pytest.mark.asyncio
async def test_process_temperature_publishes_alert_on_threshold(
self, temperature_module, mock_context):
# Arrange
temperature = 85.5
threshold = 80.0
temperature_module.set_threshold(threshold)
# Act
await temperature_module.process_temperature(temperature)
        # Assert (pytest.approx does not support datetimes, so check fields explicitly)
        mock_context.publish.assert_called_once()
        topic, payload = mock_context.publish.call_args.args
        assert topic == "temperature.alert"
        assert payload["temperature"] == temperature
        assert payload["threshold"] == threshold
        assert "timestamp" in payload  # exact value varies per run
@pytest.mark.parametrize("temperature,expected_status", [
(25.0, "healthy"),
(85.0, "degraded"),
(95.0, "unhealthy")
])
@pytest.mark.asyncio
async def test_health_check_returns_correct_status(
self, temperature_module, temperature, expected_status):
# Arrange
temperature_module.current_temperature = temperature
# Act
health = await temperature_module.get_health()
# Assert
assert health["status"] == expected_status
assert health["temperature"] == temperature
def test_on_initialized_subscribes_to_commands(
self, temperature_module, mock_context):
# Act
temperature_module.on_initialized()
# Assert
mock_context.subscribe.assert_called_with(
"temperature.commands.*",
temperature_module._handle_command
)
@pytest.mark.asyncio
async def test_handle_command_set_threshold(
self, temperature_module, mock_context):
# Arrange
message = Mock()
message.payload = {"command": "set_threshold", "value": 75.0}
# Act
await temperature_module._handle_command(message)
# Assert
assert temperature_module.threshold == 75.0
mock_context.publish.assert_called_with(
"temperature.threshold_changed",
{"old": 80.0, "new": 75.0}
)
# Mock Message Bus for testing
class MockMessageBus:
def __init__(self):
self.published = []
self.handlers = {}
async def publish(self, topic, payload):
self.published.append((topic, payload))
# Simulate message delivery
for pattern, handler in self.handlers.items():
if self._matches_pattern(pattern, topic):
message = Message(topic=topic, payload=payload)
await handler(message)
async def subscribe(self, pattern, handler):
self.handlers[pattern] = handler
    def _matches_pattern(self, pattern, topic):
        import re
        # Replace "**" before "*" so the multi-segment wildcard survives
        regex = (pattern.replace(".", r"\.")
                        .replace("**", "\0")
                        .replace("*", "[^.]+")
                        .replace("\0", ".*"))
        return re.match(f"^{regex}$", topic) is not None
# Integration test example
@pytest.mark.integration
class TestTemperatureModuleIntegration:
@pytest.fixture
async def nexus_test_env(self):
"""Creates a test environment with message bus"""
from nexus_test import TestEnvironment
env = TestEnvironment()
await env.start()
yield env
await env.stop()
@pytest.mark.asyncio
async def test_module_lifecycle(self, nexus_test_env):
# Load module
module = await nexus_test_env.load_module("temperature_module")
# Verify initialization
assert module.state == "running"
# Send test message
await nexus_test_env.publish("sensor.temperature", {"value": 75.0})
# Wait for processing
await asyncio.sleep(0.1)
# Verify response
messages = nexus_test_env.get_published_messages("temperature.processed")
assert len(messages) == 1
assert messages[0]["value"] == 75.0
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include "temperature_module.h"
using ::testing::_;
using ::testing::Return;
using ::testing::Invoke;
// Mock context for testing
class MockModuleContext : public IModuleContext {
public:
MOCK_METHOD(void, publish, (const std::string& topic, const json& payload), (override));
MOCK_METHOD(void, subscribe, (const std::string& pattern, MessageHandler handler), (override));
MOCK_METHOD(json, request, (const std::string& topic, const json& request, int timeout), (override));
};
class TemperatureModuleTest : public ::testing::Test {
protected:
std::unique_ptr<MockModuleContext> mock_context;
std::unique_ptr<TemperatureModule> module;
void SetUp() override {
mock_context = std::make_unique<MockModuleContext>();
module = std::make_unique<TemperatureModule>();
module->initialize(mock_context.get());
}
};
TEST_F(TemperatureModuleTest, ProcessTemperature_PublishesAlert_WhenThresholdExceeded) {
// Arrange
double temperature = 85.5;
double threshold = 80.0;
module->set_threshold(threshold);
json expected_payload = {
{"temperature", temperature},
{"threshold", threshold},
{"severity", "warning"}
};
EXPECT_CALL(*mock_context, publish("temperature.alert", expected_payload))
.Times(1);
// Act
module->process_temperature(temperature);
}
TEST_F(TemperatureModuleTest, GetHealth_ReturnsCorrectStatus) {
// Test healthy state
module->set_current_temperature(25.0);
auto health = module->get_health();
EXPECT_EQ(health.status, HealthStatus::Healthy);
// Test degraded state
module->set_current_temperature(85.0);
health = module->get_health();
EXPECT_EQ(health.status, HealthStatus::Degraded);
// Test unhealthy state
module->set_current_temperature(95.0);
health = module->get_health();
EXPECT_EQ(health.status, HealthStatus::Unhealthy);
}
TEST_F(TemperatureModuleTest, OnInitialized_SubscribesToCommands) {
// Expect subscription call
EXPECT_CALL(*mock_context, subscribe("temperature.commands.*", _))
.Times(1);
// Act
module->on_initialized();
}
// Test message handler
TEST_F(TemperatureModuleTest, HandleCommand_SetsThreshold) {
// Arrange
MessageHandler handler;
EXPECT_CALL(*mock_context, subscribe(_, _))
.WillOnce(Invoke([&handler](const std::string&, MessageHandler h) {
handler = h;
}));
module->on_initialized();
Message command;
command.topic = "temperature.commands.set_threshold";
command.payload = {{"value", 75.0}};
EXPECT_CALL(*mock_context, publish("temperature.threshold_changed", _))
.Times(1);
// Act
handler(command);
// Assert
EXPECT_EQ(module->get_threshold(), 75.0);
}
// Mock Message Bus for integration testing
class MockMessageBus {
private:
std::unordered_map<std::string, std::vector<MessageHandler>> handlers_;
std::vector<std::pair<std::string, json>> published_;
    mutable std::mutex mutex_; // mutable so const accessors like get_published can lock
public:
void publish(const std::string& topic, const json& payload) {
std::lock_guard<std::mutex> lock(mutex_);
published_.emplace_back(topic, payload);
// Deliver to matching handlers
for (const auto& [pattern, handler_list] : handlers_) {
if (matches_pattern(pattern, topic)) {
Message msg{topic, payload};
for (const auto& handler : handler_list) {
handler(msg);
}
}
}
}
void subscribe(const std::string& pattern, MessageHandler handler) {
std::lock_guard<std::mutex> lock(mutex_);
handlers_[pattern].push_back(handler);
}
std::vector<json> get_published(const std::string& topic) const {
std::lock_guard<std::mutex> lock(mutex_);
std::vector<json> result;
for (const auto& [t, payload] : published_) {
if (t == topic) {
result.push_back(payload);
}
}
return result;
}
private:
bool matches_pattern(const std::string& pattern, const std::string& topic) const {
// Simple wildcard matching implementation
std::regex regex(std::regex_replace(
std::regex_replace(pattern, std::regex("\\."), "\\."),
std::regex("\\*"), "[^.]+"));
return std::regex_match(topic, regex);
}
};
classdef TemperatureModuleTest < matlab.unittest.TestCase
% Unit tests for Temperature Module
properties
Module
MockContext
end
methods (TestMethodSetup)
function setupTest(testCase)
% Create mock context
testCase.MockContext = MockModuleContext();
% Create module instance
testCase.Module = TemperatureModule();
testCase.Module.initialize(testCase.MockContext);
end
end
methods (Test)
function testProcessTemperature_PublishesAlert_WhenThresholdExceeded(testCase)
% Arrange
temperature = 85.5;
threshold = 80.0;
testCase.Module.setThreshold(threshold);
% Act
testCase.Module.processTemperature(temperature);
% Assert
testCase.verifyEqual(testCase.MockContext.PublishCalls{end}.Topic, ...
'temperature.alert');
payload = testCase.MockContext.PublishCalls{end}.Payload;
testCase.verifyEqual(payload.temperature, temperature);
testCase.verifyEqual(payload.threshold, threshold);
end
function testGetHealth_ReturnsCorrectStatus(testCase)
% Test data: [temperature, expected_status]
testData = {
25.0, 'healthy';
85.0, 'degraded';
95.0, 'unhealthy'
};
for i = 1:size(testData, 1)
temperature = testData{i, 1};
expectedStatus = testData{i, 2};
% Arrange
testCase.Module.CurrentTemperature = temperature;
% Act
health = testCase.Module.getHealth();
% Assert
testCase.verifyEqual(health.Status, expectedStatus, ...
sprintf('Temperature %.1f should result in %s status', ...
temperature, expectedStatus));
end
end
function testOnInitialized_SubscribesToCommands(testCase)
% Act
testCase.Module.onInitialized();
% Assert
subscriptions = testCase.MockContext.SubscribeCalls;
testCase.verifyTrue(any(strcmp({subscriptions.Pattern}, ...
'temperature.commands.*')));
end
function testHandleCommand_SetsThreshold(testCase)
% Arrange
testCase.Module.onInitialized();
handler = testCase.MockContext.getHandler('temperature.commands.*');
message = struct(...
'Topic', 'temperature.commands.set_threshold', ...
'Payload', struct('value', 75.0));
% Act
handler(message);
% Assert
testCase.verifyEqual(testCase.Module.Threshold, 75.0);
% Verify notification published
publishCalls = testCase.MockContext.PublishCalls;
lastCall = publishCalls{end};
testCase.verifyEqual(lastCall.Topic, 'temperature.threshold_changed');
end
end
methods (Test, TestTags = {'Integration'})
function testModuleLifecycle(testCase)
% Create test environment
env = NexusTestEnvironment();
env.start();
cleanup = onCleanup(@() env.stop());
% Load module
moduleId = env.loadModule('temperature_module.yaml');
% Verify module state
state = env.getModuleState(moduleId);
testCase.verifyEqual(state, 'running');
% Send test message
env.publish('sensor.temperature', struct('value', 75.0));
% Wait for processing
pause(0.1);
% Verify response
messages = env.getPublishedMessages('temperature.processed');
testCase.verifyEqual(length(messages), 1);
testCase.verifyEqual(messages{1}.value, 75.0);
end
end
end
% Mock implementation
classdef MockModuleContext < handle
properties
PublishCalls = {}
SubscribeCalls = {}
        Handlers % set in the constructor; a containers.Map default value would be shared across instances
    end
    methods
        function obj = MockModuleContext()
            obj.Handlers = containers.Map();
        end
function publish(obj, topic, payload)
obj.PublishCalls{end+1} = struct(...
'Topic', topic, ...
'Payload', payload, ...
'Timestamp', datetime('now'));
% Simulate message delivery
keys = obj.Handlers.keys;
for i = 1:length(keys)
pattern = keys{i};
if obj.matchesPattern(pattern, topic)
handler = obj.Handlers(pattern);
message = struct('Topic', topic, 'Payload', payload);
handler(message);
end
end
end
function subscribe(obj, pattern, handler)
obj.SubscribeCalls{end+1} = struct(...
'Pattern', pattern, ...
'Handler', handler);
obj.Handlers(pattern) = handler;
end
function handler = getHandler(obj, pattern)
handler = obj.Handlers(pattern);
end
function matches = matchesPattern(~, pattern, topic)
% Simple wildcard matching
regexPattern = strrep(pattern, '.', '\.');
regexPattern = strrep(regexPattern, '*', '[^.]+');
regexPattern = ['^' regexPattern '$'];
matches = ~isempty(regexp(topic, regexPattern, 'once'));
end
end
end
// Unit Testing in LabVIEW using VI Tester
//
// Test Structure:
// 1. Create Test Class inheriting from VITester
// 2. Override Setup.vi and Teardown.vi
// 3. Create test methods (VIs starting with "test")
//
// Example: TemperatureModuleTests.lvclass
//
// === Setup.vi ===
// 1. Create Mock Context (DVR)
// 2. Initialize Temperature Module
// 3. Store references in class private data
//
// === test_ProcessTemperature_AlertOnThreshold.vi ===
// Arrange:
// - Set threshold = 80.0
// - Create temperature = 85.5
//
// Act:
// - Call ProcessTemperature.vi with mock context
//
// Assert:
// - Use "Assert Equal" VI
// - Verify publish called with "temperature.alert"
// - Check payload contains correct values
//
// === test_GetHealth_ReturnsCorrectStatus.vi ===
// Test Cases (using For Loop):
// - 25.0 → Healthy
// - 85.0 → Degraded
// - 95.0 → Unhealthy
//
// For each case:
// - Set module temperature
// - Call GetHealth.vi
// - Assert Equal on status
//
// === Mock Context Implementation ===
// MockContext.lvclass with:
//
// Data:
// - PublishCalls (Array of clusters)
// - SubscribeCalls (Array of clusters)
// - Handlers (Variant Attribute)
//
// Methods:
// - Publish.vi: Add to PublishCalls array
// - Subscribe.vi: Store handler reference
// - GetPublishedMessages.vi: Filter by topic
//
// === Integration Testing ===
// Use TestStand or custom framework:
//
// 1. Start NEXUS Test Environment
// 2. Load module configuration
// 3. Send test messages
// 4. Verify responses
// 5. Clean up resources
//
// === Best Practices ===
// - Use DVRs for mock objects
// - Implement disposable pattern
// - Group related tests in classes
// - Use parameterized tests where possible
// - Maintain test isolation
Integration Testing
Testing with NEXUS-1 Test Framework
The NEXUS-1 SDK provides a test framework for integration testing that simulates the runtime environment.
using Nexus.Testing;
using Xunit;
using Microsoft.Extensions.DependencyInjection;
public class TemperatureModuleIntegrationTests : IAsyncLifetime
{
private NexusTestHost _testHost;
private IModuleTestClient _moduleClient;
public async Task InitializeAsync()
{
// Create test host
_testHost = new NexusTestHost(builder =>
{
builder.ConfigureServices(services =>
{
// Add test services
services.AddSingleton<ITimeProvider, MockTimeProvider>();
});
builder.ConfigureModules(modules =>
{
// Load module under test
modules.AddModule<TemperatureModule>("temperature-monitor");
// Add mock dependencies
modules.AddMockModule("database-service", mock =>
{
mock.SetupRequest("db.query", req => new { result = "test" });
});
});
});
await _testHost.StartAsync();
_moduleClient = _testHost.GetModuleClient("temperature-monitor");
}
public async Task DisposeAsync()
{
await _testHost.StopAsync();
_testHost.Dispose();
}
[Fact]
public async Task Module_ProcessesTemperatureData_EndToEnd()
{
// Arrange
var testData = new { value = 75.5, sensor = "sensor-1" };
// Act - Send temperature data
await _testHost.PublishAsync("sensor.temperature", testData);
// Wait for processing
await _testHost.WaitForMessageAsync("temperature.processed",
TimeSpan.FromSeconds(5));
// Assert - Verify processed message
var messages = _testHost.GetPublishedMessages("temperature.processed");
Assert.Single(messages);
dynamic processed = messages[0];
Assert.Equal(75.5, (double)processed.value);
Assert.Equal("celsius", (string)processed.unit);
}
[Fact]
public async Task Module_HandlesHighTemperature_PublishesAlert()
{
// Arrange
await _moduleClient.SendCommandAsync("set_threshold", new { value = 80.0 });
// Act
await _testHost.PublishAsync("sensor.temperature",
new { value = 85.5, sensor = "sensor-1" });
// Assert
await _testHost.AssertMessagePublishedAsync("temperature.alert",
msg => ((dynamic)msg).severity == "warning");
}
[Fact]
public async Task Module_HealthCheck_ReflectsState()
{
// Initial health should be healthy
var health = await _moduleClient.GetHealthAsync();
Assert.Equal(HealthStatus.Healthy, health.Status);
// Simulate sensor failure
await _testHost.PublishAsync("sensor.failed",
new { sensor = "sensor-1" });
// Health should degrade
health = await _moduleClient.GetHealthAsync();
Assert.Equal(HealthStatus.Degraded, health.Status);
Assert.Contains("Sensor failure", health.Message);
}
[Fact]
public async Task Module_MessagePatterns_WorkCorrectly()
{
// Test wildcard subscriptions
var received = new List<string>();
await _testHost.SubscribeAsync("temperature.*", msg =>
{
received.Add(msg.Topic);
return Task.CompletedTask;
});
// Publish various topics
await _testHost.PublishAsync("temperature.reading", new { });
await _testHost.PublishAsync("temperature.alert", new { });
await _testHost.PublishAsync("humidity.reading", new { }); // Should not match
await Task.Delay(100); // Wait for delivery
Assert.Equal(2, received.Count);
Assert.Contains("temperature.reading", received);
Assert.Contains("temperature.alert", received);
}
}
// Performance testing
[Collection("Performance")]
public class TemperatureModulePerformanceTests
{
[Fact]
public async Task Module_HandlesHighLoad()
{
using var testHost = new NexusTestHost();
await testHost.StartAsync();
var metrics = testHost.EnableMetrics();
// Send 1000 messages
var tasks = Enumerable.Range(0, 1000)
.Select(i => testHost.PublishAsync("sensor.temperature",
new { value = 20.0 + i % 10, sensor = $"sensor-{i % 5}" }))
.ToArray();
await Task.WhenAll(tasks);
// Wait for processing
await Task.Delay(TimeSpan.FromSeconds(5));
// Verify performance metrics
Assert.True(metrics.AverageLatency < 10, "Latency should be under 10ms");
Assert.True(metrics.MessagesPerSecond > 100, "Should process >100 msg/sec");
Assert.Equal(1000, metrics.TotalProcessed);
}
}
import pytest
import asyncio
from nexus.testing import NexusTestHost, ModuleTestClient
from datetime import datetime, timedelta
class TestTemperatureModuleIntegration:
@pytest.fixture
async def test_host(self):
"""Create and start test host"""
host = NexusTestHost()
# Configure test environment
host.configure_modules({
"temperature-monitor": {
"module": "temperature_module.TemperatureModule",
"config": {"threshold": 80.0}
}
})
# Add mock modules
host.add_mock_module("database-service", {
"db.query": lambda req: {"result": "test"}
})
await host.start()
yield host
await host.stop()
@pytest.fixture
def module_client(self, test_host):
"""Get client for module under test"""
return test_host.get_module_client("temperature-monitor")
@pytest.mark.asyncio
async def test_module_processes_temperature_data(self, test_host):
# Arrange
test_data = {"value": 75.5, "sensor": "sensor-1"}
# Act - Send temperature data
await test_host.publish("sensor.temperature", test_data)
# Wait for processing
message = await test_host.wait_for_message(
"temperature.processed",
timeout=5.0
)
# Assert
assert message is not None
assert message["value"] == 75.5
assert message["unit"] == "celsius"
assert "timestamp" in message
@pytest.mark.asyncio
async def test_module_handles_high_temperature(self, test_host, module_client):
# Arrange - Set threshold
await module_client.send_command("set_threshold", {"value": 80.0})
# Act - Send high temperature
await test_host.publish("sensor.temperature", {
"value": 85.5,
"sensor": "sensor-1"
})
# Assert - Verify alert
alert = await test_host.wait_for_message(
"temperature.alert",
timeout=2.0
)
assert alert is not None
assert alert["severity"] == "warning"
assert alert["temperature"] == 85.5
assert alert["threshold"] == 80.0
@pytest.mark.asyncio
async def test_module_health_reflects_state(self, test_host, module_client):
# Initial health should be healthy
health = await module_client.get_health()
assert health["status"] == "healthy"
# Simulate sensor failure
await test_host.publish("sensor.failed", {"sensor": "sensor-1"})
# Wait for state update
await asyncio.sleep(0.1)
# Health should degrade
health = await module_client.get_health()
assert health["status"] == "degraded"
assert "sensor failure" in health["message"].lower()
@pytest.mark.asyncio
async def test_message_patterns_work_correctly(self, test_host):
# Test wildcard subscriptions
received = []
async def handler(msg):
received.append(msg.topic)
await test_host.subscribe("temperature.*", handler)
# Publish various topics
await test_host.publish("temperature.reading", {})
await test_host.publish("temperature.alert", {})
await test_host.publish("humidity.reading", {}) # Should not match
await asyncio.sleep(0.1) # Wait for delivery
assert len(received) == 2
assert "temperature.reading" in received
assert "temperature.alert" in received
@pytest.mark.asyncio
async def test_request_response_pattern(self, test_host):
# Test synchronous communication
response = await test_host.request(
"temperature.query",
{"type": "current"}
)
assert response is not None
assert "value" in response
assert "timestamp" in response
# Performance testing
@pytest.mark.performance
class TestTemperatureModulePerformance:
@pytest.mark.asyncio
async def test_module_handles_high_load(self, test_host):
# Enable metrics
metrics = test_host.enable_metrics()
# Send 1000 messages
tasks = []
for i in range(1000):
task = test_host.publish("sensor.temperature", {
"value": 20.0 + i % 10,
"sensor": f"sensor-{i % 5}"
})
tasks.append(task)
await asyncio.gather(*tasks)
# Wait for processing
await asyncio.sleep(5)
# Verify performance
assert metrics.average_latency < 10 # ms
assert metrics.messages_per_second > 100
assert metrics.total_processed == 1000
@pytest.mark.asyncio
async def test_memory_usage_stable(self, test_host):
import psutil
process = psutil.Process()
initial_memory = process.memory_info().rss / 1024 / 1024 # MB
# Process messages for 30 seconds
start_time = asyncio.get_event_loop().time()
while asyncio.get_event_loop().time() - start_time < 30:
await test_host.publish("sensor.temperature", {
"value": 25.0,
"sensor": "test"
})
await asyncio.sleep(0.01) # 100 msg/sec
final_memory = process.memory_info().rss / 1024 / 1024 # MB
memory_increase = final_memory - initial_memory
# Memory should not increase significantly
assert memory_increase < 50 # MB
#include <nexus/testing.h>
#include <gtest/gtest.h>
#include <chrono>
#include <future>
class TemperatureModuleIntegrationTest : public ::testing::Test {
protected:
std::unique_ptr<NexusTestHost> test_host;
std::unique_ptr<ModuleTestClient> module_client;
void SetUp() override {
// Create test host
test_host = std::make_unique<NexusTestHost>();
// Configure modules
test_host->configure_modules({
{"temperature-monitor", {
{"type", "temperature_module"},
{"config", {
{"threshold", 80.0}
}}
}}
});
// Add mock modules
test_host->add_mock_module("database-service",
[](const std::string& topic, const json& request) {
if (topic == "db.query") {
return json{{"result", "test"}};
}
return json{};
});
// Start test host
test_host->start();
// Get module client
module_client = test_host->get_module_client("temperature-monitor");
}
void TearDown() override {
test_host->stop();
}
};
TEST_F(TemperatureModuleIntegrationTest, ProcessesTemperatureData) {
// Arrange
json test_data = {
{"value", 75.5},
{"sensor", "sensor-1"}
};
// Set up message capture
std::promise<json> message_promise;
auto message_future = message_promise.get_future();
test_host->subscribe("temperature.processed",
[&message_promise](const Message& msg) {
message_promise.set_value(msg.payload);
});
// Act - Send temperature data
test_host->publish("sensor.temperature", test_data);
// Wait for processing
auto status = message_future.wait_for(std::chrono::seconds(5));
ASSERT_EQ(status, std::future_status::ready);
// Assert
auto processed = message_future.get();
EXPECT_EQ(processed["value"], 75.5);
EXPECT_EQ(processed["unit"], "celsius");
EXPECT_TRUE(processed.contains("timestamp"));
}
TEST_F(TemperatureModuleIntegrationTest, HandlesHighTemperature) {
// Arrange - Set threshold
module_client->send_command("set_threshold", {{"value", 80.0}});
// Capture alert
std::promise<json> alert_promise;
auto alert_future = alert_promise.get_future();
test_host->subscribe("temperature.alert",
[&alert_promise](const Message& msg) {
alert_promise.set_value(msg.payload);
});
// Act - Send high temperature
test_host->publish("sensor.temperature", {
{"value", 85.5},
{"sensor", "sensor-1"}
});
// Wait for alert
auto status = alert_future.wait_for(std::chrono::seconds(2));
ASSERT_EQ(status, std::future_status::ready);
// Assert
auto alert = alert_future.get();
EXPECT_EQ(alert["severity"], "warning");
EXPECT_EQ(alert["temperature"], 85.5);
EXPECT_EQ(alert["threshold"], 80.0);
}
TEST_F(TemperatureModuleIntegrationTest, HealthReflectsState) {
// Initial health should be healthy
auto health = module_client->get_health();
EXPECT_EQ(health.status, HealthStatus::Healthy);
// Simulate sensor failure
test_host->publish("sensor.failed", {{"sensor", "sensor-1"}});
// Wait for state update
std::this_thread::sleep_for(std::chrono::milliseconds(100));
// Health should degrade
health = module_client->get_health();
EXPECT_EQ(health.status, HealthStatus::Degraded);
EXPECT_TRUE(health.message.find("Sensor failure") != std::string::npos);
}
TEST_F(TemperatureModuleIntegrationTest, MessagePatternsWork) {
// Test wildcard subscriptions
std::vector<std::string> received;
std::mutex received_mutex;
test_host->subscribe("temperature.*",
[&received, &received_mutex](const Message& msg) {
std::lock_guard<std::mutex> lock(received_mutex);
received.push_back(msg.topic);
});
// Publish various topics
test_host->publish("temperature.reading", {});
test_host->publish("temperature.alert", {});
test_host->publish("humidity.reading", {}); // Should not match
// Wait for delivery
std::this_thread::sleep_for(std::chrono::milliseconds(100));
// Verify
EXPECT_EQ(received.size(), 2);
EXPECT_TRUE(std::find(received.begin(), received.end(),
"temperature.reading") != received.end());
EXPECT_TRUE(std::find(received.begin(), received.end(),
"temperature.alert") != received.end());
}
// Performance testing
class TemperatureModulePerformanceTest : public ::testing::Test {
protected:
std::unique_ptr<NexusTestHost> test_host;
void SetUp() override {
test_host = std::make_unique<NexusTestHost>();
test_host->start();
}
};
TEST_F(TemperatureModulePerformanceTest, HandlesHighLoad) {
auto metrics = test_host->enable_metrics();
// Send 1000 messages
std::vector<std::future<void>> futures;
for (int i = 0; i < 1000; ++i) {
futures.push_back(
std::async(std::launch::async, [this, i]() {
test_host->publish("sensor.temperature", {
{"value", 20.0 + i % 10},
{"sensor", "sensor-" + std::to_string(i % 5)}
});
})
);
}
// Wait for all sends
for (auto& f : futures) {
f.wait();
}
// Wait for processing
std::this_thread::sleep_for(std::chrono::seconds(5));
// Verify performance
EXPECT_LT(metrics->average_latency_ms(), 10);
EXPECT_GT(metrics->messages_per_second(), 100);
EXPECT_EQ(metrics->total_processed(), 1000);
}
classdef TemperatureModuleIntegrationTest < matlab.unittest.TestCase
% Integration tests for Temperature Module
properties
TestHost
ModuleClient
end
methods (TestMethodSetup)
function setupTest(testCase)
% Create test host
testCase.TestHost = NexusTestHost();
% Configure modules
config = struct(...
'modules', struct(...
                'temperature_monitor', struct(... % struct field names cannot contain hyphens
'type', 'temperature_module', ...
'config', struct('threshold', 80.0))));
testCase.TestHost.configure(config);
% Add mock modules
testCase.TestHost.addMockModule('database-service', ...
@(topic, request) struct('result', 'test'));
% Start test host
testCase.TestHost.start();
% Get module client
testCase.ModuleClient = testCase.TestHost.getModuleClient(...
'temperature-monitor');
end
function teardownTest(testCase)
testCase.TestHost.stop();
end
end
methods (Test)
function testProcessesTemperatureData(testCase)
% Arrange
testData = struct('value', 75.5, 'sensor', 'sensor-1');
            % waitForMessage below captures the processed message
% Act - Send temperature data
testCase.TestHost.publish('sensor.temperature', testData);
% Wait for processing
message = testCase.TestHost.waitForMessage(...
'temperature.processed', 5.0);
% Assert
testCase.verifyNotEmpty(message);
testCase.verifyEqual(message.value, 75.5);
testCase.verifyEqual(message.unit, 'celsius');
testCase.verifyTrue(isfield(message, 'timestamp'));
end
function testHandlesHighTemperature(testCase)
% Arrange - Set threshold
testCase.ModuleClient.sendCommand('set_threshold', ...
struct('value', 80.0));
% Act - Send high temperature
testCase.TestHost.publish('sensor.temperature', ...
struct('value', 85.5, 'sensor', 'sensor-1'));
% Wait for alert
alert = testCase.TestHost.waitForMessage(...
'temperature.alert', 2.0);
% Assert
testCase.verifyNotEmpty(alert);
testCase.verifyEqual(alert.severity, 'warning');
testCase.verifyEqual(alert.temperature, 85.5);
testCase.verifyEqual(alert.threshold, 80.0);
end
function testHealthReflectsState(testCase)
% Initial health should be healthy
health = testCase.ModuleClient.getHealth();
testCase.verifyEqual(health.status, 'healthy');
% Simulate sensor failure
testCase.TestHost.publish('sensor.failed', ...
struct('sensor', 'sensor-1'));
% Wait for state update
pause(0.1);
% Health should degrade
health = testCase.ModuleClient.getHealth();
testCase.verifyEqual(health.status, 'degraded');
testCase.verifySubstring(health.message, 'Sensor failure');
end
function testMessagePatternsWork(testCase)
            % Test wildcard subscriptions; the test host records matching messages
            testCase.TestHost.subscribe('temperature.*', ...
                @(msg) testCase.TestHost.collectMessage(msg));
% Publish various topics
testCase.TestHost.publish('temperature.reading', struct());
testCase.TestHost.publish('temperature.alert', struct());
testCase.TestHost.publish('humidity.reading', struct());
% Wait for delivery
pause(0.1);
% Get collected messages
messages = testCase.TestHost.getCollectedMessages();
topics = cellfun(@(m) m.topic, messages, 'UniformOutput', false);
% Verify
testCase.verifyEqual(length(topics), 2);
testCase.verifyTrue(any(strcmp(topics, 'temperature.reading')));
testCase.verifyTrue(any(strcmp(topics, 'temperature.alert')));
end
end
methods (Test, TestTags = {'Performance'})
function testHandlesHighLoad(testCase)
% Enable metrics
metrics = testCase.TestHost.enableMetrics();
% Send 1000 messages
tic;
for i = 1:1000
testCase.TestHost.publish('sensor.temperature', ...
struct(...
'value', 20.0 + mod(i, 10), ...
'sensor', sprintf('sensor-%d', mod(i, 5))));
end
sendTime = toc;
% Wait for processing
pause(5);
% Verify performance
testCase.verifyLessThan(metrics.averageLatency, 10); % ms
testCase.verifyGreaterThan(metrics.messagesPerSecond, 100);
testCase.verifyEqual(metrics.totalProcessed, 1000);
fprintf('Sent 1000 messages in %.2f seconds\n', sendTime);
end
function testMemoryUsageStable(testCase)
% Get initial memory
initialMemory = memory;
initialUsed = initialMemory.MemUsedMATLAB / 1e6; % MB
% Process messages for 30 seconds
startTime = tic;
messageCount = 0;
while toc(startTime) < 30
testCase.TestHost.publish('sensor.temperature', ...
struct('value', 25.0, 'sensor', 'test'));
pause(0.01); % 100 msg/sec
messageCount = messageCount + 1;
end
% Get final memory
finalMemory = memory;
finalUsed = finalMemory.MemUsedMATLAB / 1e6; % MB
memoryIncrease = finalUsed - initialUsed;
% Memory should not increase significantly
testCase.verifyLessThan(memoryIncrease, 50); % MB
fprintf('Processed %d messages, memory increased by %.1f MB\n', ...
messageCount, memoryIncrease);
end
end
end
// Integration Testing in LabVIEW
//
// === Test Framework Setup ===
// Use NexusTestHost.lvlib for integration testing
//
// 1. Initialize Test Host:
// - Create NexusTestHost DVR
// - Configure modules (cluster array)
// - Add mock modules
// - Call StartTestHost.vi
//
// 2. Module Configuration:
// modules[0]:
// - name: "temperature-monitor"
// - type: "temperature_module"
// - config: {threshold: 80.0}
//
// === Test Cases ===
//
// test_ProcessesTemperatureData.vi:
// 1. Subscribe to "temperature.processed"
// 2. Publish to "sensor.temperature"
// - value: 75.5
// - sensor: "sensor-1"
// 3. WaitForMessage.vi (timeout: 5000ms)
// 4. Assert message received
// 5. Verify payload fields
//
// test_HandlesHighTemperature.vi:
// 1. Send command "set_threshold" = 80.0
// 2. Publish temperature = 85.5
// 3. WaitForMessage "temperature.alert"
// 4. Verify alert severity = "warning"
//
// test_HealthReflectsState.vi:
// 1. Get initial health → should be "healthy"
// 2. Publish "sensor.failed"
// 3. Wait 100ms
// 4. Get health again → should be "degraded"
//
// test_MessagePatterns.vi:
// 1. Subscribe to "temperature.*"
// 2. Publish to:
// - "temperature.reading" ✓
// - "temperature.alert" ✓
// - "humidity.reading" ✗
// 3. Verify only 2 messages received
//
// === Performance Testing ===
//
// test_HighLoad.vi:
// 1. Enable metrics collection
// 2. For Loop (N=1000)
// - Publish temperature data
// - Vary sensor ID (mod 5)
// 3. Wait 5 seconds
// 4. Check metrics:
// - Average latency < 10ms
// - Messages/sec > 100
// - Total processed = 1000
//
// === Test Utilities ===
//
// WaitForMessage.vi:
// - Inputs: topic, timeout
// - Outputs: message, timed out?
// - Uses Queue with timeout
//
// AssertMessagePublished.vi:
// - Inputs: topic, predicate VI ref
// - Searches published messages
// - Calls predicate for matching
//
// MockModule.vi:
// - Inputs: name, handlers
// - Simulates external module
// - Returns canned responses
//
// === Best Practices ===
// - Use TestStand for complex scenarios
// - Implement proper cleanup in all tests
// - Use DVRs for shared test resources
// - Group related tests in sequences
// - Monitor memory during long tests
Mock and Stub Patterns
Creating Test Doubles
Use mocks and stubs to isolate module behavior and simulate external dependencies.
// Creating a mock context
public class MockModuleContext : IModuleContext
{
private readonly Dictionary<string, List<Func<Message, Task>>> _handlers = new();
private readonly List<PublishedMessage> _published = new();
public IReadOnlyList<PublishedMessage> PublishedMessages => _published;
public Task PublishAsync(string topic, object payload)
{
_published.Add(new PublishedMessage(topic, payload, DateTime.UtcNow));
return Task.CompletedTask;
}
public Task SubscribeAsync(string pattern, Func<Message, Task> handler)
{
if (!_handlers.ContainsKey(pattern))
_handlers[pattern] = new();
_handlers[pattern].Add(handler);
return Task.CompletedTask;
}
public async Task<TResponse> RequestAsync<TResponse>(
string topic, object request, TimeSpan timeout)
{
// Simulate request handling
await Task.Delay(10);
// Return mock response based on topic
return topic switch
{
"config.get" => (TResponse)(object)new { value = "test" },
"database.query" => (TResponse)(object)new { results = new[] { 1, 2, 3 } },
_ => throw new TimeoutException($"No handler for {topic}")
};
}
// Helper methods for testing
public async Task SimulateMessageAsync(string topic, object payload)
{
var message = new Message { Topic = topic, Payload = payload };
foreach (var kvp in _handlers)
{
if (MatchesPattern(kvp.Key, topic))
{
foreach (var handler in kvp.Value)
{
await handler(message);
}
}
}
}
public void VerifyPublished(string topic, Func<object, bool> predicate = null)
{
var published = _published.Any(p =>
p.Topic == topic && (predicate == null || predicate(p.Payload)));
if (!published)
throw new Exception($"Expected message on topic '{topic}' was not published");
    }
    private static bool MatchesPattern(string pattern, string topic)
    {
        // Same wildcard rules as the mock bus: "*" is one segment, "**" is any depth
        var regex = "^" + pattern
            .Replace(".", "\\.")
            .Replace("**", "\u0001")
            .Replace("*", "[^.]+")
            .Replace("\u0001", ".*") + "$";
        return System.Text.RegularExpressions.Regex.IsMatch(topic, regex);
    }
}
// Creating a behavior-driven mock
public class BehaviorMockContext : IModuleContext
{
private readonly Dictionary<string, Func<object, object>> _requestHandlers = new();
private readonly Dictionary<string, Action<object>> _publishAssertions = new();
public void SetupRequest<TRequest, TResponse>(
string topic,
Func<TRequest, TResponse> handler)
{
_requestHandlers[topic] = req => handler((TRequest)req);
}
public void ExpectPublish(string topic, Action<object> assertion)
{
_publishAssertions[topic] = assertion;
}
public Task PublishAsync(string topic, object payload)
{
if (_publishAssertions.TryGetValue(topic, out var assertion))
{
assertion(payload);
}
return Task.CompletedTask;
}
public async Task<TResponse> RequestAsync<TResponse>(
string topic, object request, TimeSpan timeout)
{
if (_requestHandlers.TryGetValue(topic, out var handler))
{
await Task.Yield(); // Simulate async
return (TResponse)handler(request);
}
throw new InvalidOperationException($"No handler setup for {topic}");
}
}
// Using mocks in tests
[Fact]
public void TestWithBehaviorMock()
{
// Arrange
var mockContext = new BehaviorMockContext();
// Setup expected behavior
mockContext.SetupRequest<ConfigRequest, ConfigResponse>(
"config.get",
req => new ConfigResponse { Value = req.Key + "_value" });
mockContext.ExpectPublish("status.changed", payload =>
{
dynamic status = payload;
Assert.Equal("ready", status.state);
});
var module = new MyModule();
module.Initialize(mockContext);
// Act
module.Start();
// Assert - assertions run automatically
}
import asyncio
import pytest
from datetime import datetime
from unittest.mock import Mock, AsyncMock, MagicMock
class MockModuleContext:
"""Mock implementation of module context for testing"""
def __init__(self):
self.published_messages = []
self.subscriptions = {}
self.request_handlers = {}
async def publish(self, topic, payload):
self.published_messages.append({
'topic': topic,
'payload': payload,
'timestamp': datetime.utcnow()
})
# Simulate message delivery
for pattern, handler in self.subscriptions.items():
if self._matches_pattern(pattern, topic):
message = Message(topic=topic, payload=payload)
await handler(message)
async def subscribe(self, pattern, handler):
self.subscriptions[pattern] = handler
async def request(self, topic, request, timeout=30):
await asyncio.sleep(0.01) # Simulate network delay
if topic in self.request_handlers:
return self.request_handlers[topic](request)
# Default responses
if topic == "config.get":
return {"value": "test"}
elif topic == "database.query":
return {"results": [1, 2, 3]}
raise TimeoutError(f"No handler for {topic}")
# Helper methods
def setup_request_handler(self, topic, handler):
self.request_handlers[topic] = handler
def get_published(self, topic=None):
if topic:
return [m for m in self.published_messages if m['topic'] == topic]
return self.published_messages
def assert_published(self, topic, predicate=None):
messages = self.get_published(topic)
assert len(messages) > 0, f"No messages published to {topic}"
if predicate:
assert any(predicate(m['payload']) for m in messages), \
f"No message matching predicate on {topic}"
async def simulate_message(self, topic, payload):
"""Simulate receiving a message"""
message = Message(topic=topic, payload=payload)
for pattern, handler in self.subscriptions.items():
if self._matches_pattern(pattern, topic):
await handler(message)
def _matches_pattern(self, pattern, topic):
import re
regex = pattern.replace(".", r"\.").replace("*", "[^.]+")
return re.match(f"^{regex}$", topic) is not None
# Using pytest-mock
@pytest.fixture
def mock_context(mocker):
"""Fixture providing a mock context with common setups"""
context = mocker.Mock()
context.publish = mocker.AsyncMock()
context.subscribe = mocker.AsyncMock()
context.request = mocker.AsyncMock()
# Setup common responses
context.request.side_effect = lambda topic, req, timeout=30: {
"config.get": {"value": "test"},
"database.query": {"results": [1, 2, 3]}
}.get(topic, {})
return context
# Behavior-driven mock
class BehaviorMockContext:
"""Mock that verifies behavior"""
def __init__(self):
self.expectations = []
self.request_stubs = {}
def expect_publish(self, topic, assertion):
self.expectations.append(('publish', topic, assertion))
def stub_request(self, topic, response_func):
self.request_stubs[topic] = response_func
async def publish(self, topic, payload):
for exp_type, exp_topic, assertion in self.expectations:
if exp_type == 'publish' and exp_topic == topic:
assertion(payload)
self.expectations.remove((exp_type, exp_topic, assertion))
return
raise AssertionError(f"Unexpected publish to {topic}")
async def request(self, topic, request, timeout=30):
if topic in self.request_stubs:
return self.request_stubs[topic](request)
raise ValueError(f"No stub for request to {topic}")
def verify_all_expectations_met(self):
assert len(self.expectations) == 0, \
f"Unmet expectations: {self.expectations}"
# Using in tests
@pytest.mark.asyncio
async def test_with_behavior_mock():
# Arrange
mock = BehaviorMockContext()
# Setup expectations
mock.expect_publish("status.changed",
lambda payload: payload['state'] == 'ready')
mock.stub_request("config.get",
lambda req: {"value": f"{req['key']}_value"})
module = MyModule()
module.initialize(mock)
# Act
await module.start()
# Assert
mock.verify_all_expectations_met()
# Creating a spy that records interactions
class SpyContext:
def __init__(self, real_context):
self.real_context = real_context
self.interactions = []
async def publish(self, topic, payload):
self.interactions.append(('publish', topic, payload))
return await self.real_context.publish(topic, payload)
async def subscribe(self, pattern, handler):
self.interactions.append(('subscribe', pattern))
return await self.real_context.subscribe(pattern, handler)
def get_interaction_count(self, method, topic=None):
return len([i for i in self.interactions
if i[0] == method and (topic is None or i[1] == topic)])
// Google Mock based mocking
class MockModuleContext : public IModuleContext {
public:
MOCK_METHOD(void, publish,
(const std::string& topic, const json& payload), (override));
MOCK_METHOD(void, subscribe,
(const std::string& pattern, MessageHandler handler), (override));
MOCK_METHOD(json, request,
(const std::string& topic, const json& request, int timeout), (override));
// Helper to capture published messages
std::vector<std::pair<std::string, json>> published_messages;
void capture_publishes() {
ON_CALL(*this, publish(_, _))
.WillByDefault([this](const std::string& topic, const json& payload) {
published_messages.emplace_back(topic, payload);
});
}
};
// Manual mock implementation
class ManualMockContext : public IModuleContext {
private:
std::unordered_map<std::string, MessageHandler> handlers_;
std::vector<std::pair<std::string, json>> published_;
std::unordered_map<std::string, std::function<json(const json&)>> request_handlers_;
public:
void publish(const std::string& topic, const json& payload) override {
published_.emplace_back(topic, payload);
// Simulate message delivery
for (const auto& [pattern, handler] : handlers_) {
if (matches_pattern(pattern, topic)) {
Message msg{topic, payload};
handler(msg);
}
}
}
void subscribe(const std::string& pattern, MessageHandler handler) override {
handlers_[pattern] = handler;
}
json request(const std::string& topic, const json& request, int timeout) override {
if (request_handlers_.count(topic)) {
return request_handlers_[topic](request);
}
// Default responses
if (topic == "config.get") {
return {{"value", "test"}};
} else if (topic == "database.query") {
return {{"results", {1, 2, 3}}};
}
throw std::runtime_error("No handler for " + topic);
}
// Test helpers
void setup_request(const std::string& topic,
std::function<json(const json&)> handler) {
request_handlers_[topic] = handler;
}
void assert_published(const std::string& topic,
std::function<bool(const json&)> predicate = nullptr) {
auto found = std::any_of(published_.begin(), published_.end(),
[&](const auto& p) {
return p.first == topic &&
(!predicate || predicate(p.second));
});
ASSERT_TRUE(found) << "Expected message on " << topic;
}
std::vector<json> get_published(const std::string& topic) const {
std::vector<json> result;
for (const auto& [t, payload] : published_) {
if (t == topic) {
result.push_back(payload);
}
}
return result;
}
};
// Behavior verification mock
class StrictMockContext : public IModuleContext {
private:
struct Expectation {
std::string method;
std::string topic;
std::function<void(const json&)> verifier;
bool satisfied = false;
};
std::vector<Expectation> expectations_;
public:
void expect_publish(const std::string& topic,
std::function<void(const json&)> verifier) {
expectations_.push_back({"publish", topic, verifier, false});
}
void publish(const std::string& topic, const json& payload) override {
for (auto& exp : expectations_) {
if (exp.method == "publish" && exp.topic == topic && !exp.satisfied) {
exp.verifier(payload);
exp.satisfied = true;
return;
}
}
FAIL() << "Unexpected publish to " << topic;
}
~StrictMockContext() {
verify_all_satisfied();
}
void verify_all_satisfied() {
for (const auto& exp : expectations_) {
EXPECT_TRUE(exp.satisfied)
<< "Unsatisfied expectation: " << exp.method
<< " on " << exp.topic;
}
}
};
// Using in tests
TEST(ModuleTest, WithGoogleMock) {
// Strict mock - all calls must be expected
::testing::StrictMock<MockModuleContext> mock_context;
// Set expectations
EXPECT_CALL(mock_context, subscribe("sensor.*", _))
.Times(1);
EXPECT_CALL(mock_context, publish("status.ready", _))
.Times(1)
.WillOnce([](const std::string&, const json& payload) {
EXPECT_EQ(payload["state"], "ready");
});
EXPECT_CALL(mock_context, request("config.get", _, _))
.WillOnce(Return(json{{"threshold", 80.0}}));
// Test module
MyModule module;
module.initialize(&mock_context);
module.start();
}
classdef MockModuleContext < handle
% Mock implementation of module context
properties
PublishedMessages = {}
Subscriptions = containers.Map()
RequestHandlers = containers.Map()
CallLog = {}
end
methods
function publish(obj, topic, payload)
% Record published message
obj.PublishedMessages{end+1} = struct(...
'topic', topic, ...
'payload', payload, ...
'timestamp', datetime('now'));
obj.logCall('publish', topic);
% Simulate message delivery
keys = obj.Subscriptions.keys;
for i = 1:length(keys)
pattern = keys{i};
if obj.matchesPattern(pattern, topic)
handler = obj.Subscriptions(pattern);
message = struct('topic', topic, 'payload', payload);
handler(message);
end
end
end
function subscribe(obj, pattern, handler)
obj.Subscriptions(pattern) = handler;
obj.logCall('subscribe', pattern);
end
function response = request(obj, topic, request, timeout)
if nargin < 4
timeout = 30;
end
obj.logCall('request', topic);
% Check for custom handler
if obj.RequestHandlers.isKey(topic)
handler = obj.RequestHandlers(topic);
response = handler(request);
return;
end
% Default responses
switch topic
case 'config.get'
response = struct('value', 'test');
case 'database.query'
response = struct('results', [1, 2, 3]);
otherwise
error('No handler for topic: %s', topic);
end
end
% Test helper methods
function setupRequest(obj, topic, handler)
obj.RequestHandlers(topic) = handler;
end
function assertPublished(obj, topic, predicate)
found = false;
for i = 1:length(obj.PublishedMessages)
msg = obj.PublishedMessages{i};
if strcmp(msg.topic, topic)
if nargin < 3 || predicate(msg.payload)
found = true;
break;
end
end
end
assert(found, 'Expected message on topic %s not found', topic);
end
function messages = getPublished(obj, topic)
messages = {};
for i = 1:length(obj.PublishedMessages)
msg = obj.PublishedMessages{i};
if strcmp(msg.topic, topic)
messages{end+1} = msg.payload;
end
end
end
function simulateMessage(obj, topic, payload)
% Simulate receiving a message
keys = obj.Subscriptions.keys;
for i = 1:length(keys)
pattern = keys{i};
if obj.matchesPattern(pattern, topic)
handler = obj.Subscriptions(pattern);
message = struct('topic', topic, 'payload', payload);
handler(message);
end
end
end
function logCall(obj, method, arg)
obj.CallLog{end+1} = struct(...
'method', method, ...
'argument', arg, ...
'timestamp', datetime('now'));
end
function count = getCallCount(obj, method, arg)
count = 0;
for i = 1:length(obj.CallLog)
call = obj.CallLog{i};
if strcmp(call.method, method)
if nargin < 3 || strcmp(call.argument, arg)
count = count + 1;
end
end
end
end
end
methods (Access = private)
function matches = matchesPattern(~, pattern, topic)
% Simple wildcard matching
regexPattern = strrep(pattern, '.', '\.');
regexPattern = strrep(regexPattern, '*', '[^.]+');
regexPattern = ['^' regexPattern '$'];
matches = ~isempty(regexp(topic, regexPattern, 'once'));
end
end
end
% Behavior verification mock
classdef StrictMockContext < handle
properties (Access = private)
Expectations = {}
end
methods
function expectPublish(obj, topic, verifier)
obj.Expectations{end+1} = struct(...
'method', 'publish', ...
'topic', topic, ...
'verifier', verifier, ...
'satisfied', false);
end
function publish(obj, topic, payload)
for i = 1:length(obj.Expectations)
exp = obj.Expectations{i};
if strcmp(exp.method, 'publish') && ...
strcmp(exp.topic, topic) && ~exp.satisfied
% Run verifier
exp.verifier(payload);
obj.Expectations{i}.satisfied = true;
return;
end
end
error('Unexpected publish to %s', topic);
end
function verifyAllSatisfied(obj)
for i = 1:length(obj.Expectations)
exp = obj.Expectations{i};
assert(exp.satisfied, ...
'Unsatisfied expectation: %s on %s', ...
exp.method, exp.topic);
end
end
end
end
% Using in tests
classdef TestWithMocks < matlab.unittest.TestCase
methods (Test)
function testWithManualMock(testCase)
% Create mock
mock = MockModuleContext();
% Setup request handler
mock.setupRequest('config.get', ...
@(req) struct('value', [req.key '_value']));
% Create and test module
module = MyModule();
module.initialize(mock);
module.start();
% Verify behavior
mock.assertPublished('status.ready', ...
@(payload) strcmp(payload.state, 'ready'));
testCase.verifyEqual(mock.getCallCount('subscribe'), 2);
end
function testWithStrictMock(testCase)
% Create strict mock
mock = StrictMockContext();
% Set expectations
mock.expectPublish('status.ready', ...
@(payload) assert(strcmp(payload.state, 'ready')));
% Test module
module = MyModule();
module.initialize(mock);
module.start();
% Verify all expectations met
mock.verifyAllSatisfied();
end
end
end
// Mock Patterns in LabVIEW
//
// === Creating Mock Context ===
// MockContext.lvclass:
//
// Private Data:
// - Published Messages (Array)
// - Subscriptions (Map)
// - Request Handlers (Map)
// - Call Log (Array)
//
// === Mock Methods ===
//
// Publish.vi:
// 1. Add to Published Messages array
// 2. Log call with timestamp
// 3. Check subscriptions for match
// 4. Call matching handlers
//
// Subscribe.vi:
// 1. Store handler in map
// 2. Log subscription call
//
// Request.vi:
// 1. Check custom handlers first
// 2. Return default responses
// 3. Log request call
//
// === Test Helper VIs ===
//
// SetupRequest.vi:
// - Input: topic, handler VI ref
// - Stores custom handler
//
// AssertPublished.vi:
// - Input: topic, predicate VI ref
// - Searches published messages
// - Calls predicate if provided
// - Assert if not found
//
// GetPublished.vi:
// - Input: topic (optional)
// - Output: filtered messages
//
// SimulateMessage.vi:
// - Input: topic, payload
// - Finds matching subscriptions
// - Calls handlers
//
// === Behavior Mock ===
// StrictMockContext.lvclass:
//
// ExpectPublish.vi:
// - Input: topic, verifier VI ref
// - Adds to expectations list
//
// Publish.vi (Override):
// - Find matching expectation
// - Run verifier
// - Mark satisfied
// - Error if unexpected
//
// VerifyAllSatisfied.vi:
// - Check all expectations
// - Error if any unsatisfied
//
// === Usage Example ===
//
// test_ModuleWithMock.vi:
// 1. Create MockContext DVR
// 2. Setup custom handlers:
// - "config.get" → return test config
// 3. Initialize module with mock
// 4. Run test scenario
// 5. Assert published messages
// 6. Check call counts
//
// === Advanced Patterns ===
//
// Spy Pattern:
// - Wrap real context
// - Record all interactions
// - Pass through to real
//
// Stub Pattern:
// - Minimal implementation
// - Fixed responses
// - No behavior verification
//
// === Best Practices ===
// - Use DVRs for mock objects
// - Clear mock state between tests
// - Group related mocks in library
// - Document expected behavior
CI/CD Integration
Automated Testing Pipeline
Integrate NEXUS-1 module testing into your continuous integration pipeline.
# GitHub Actions example
name: Module Tests
on:
push:
branches: [ main, develop ]
pull_request:
branches: [ main ]
jobs:
test:
runs-on: ubuntu-latest
services:
nexus-test:
image: nexus/test-environment:latest
ports:
- 5000:5000
options: --health-cmd "nexus health" --health-interval 10s
steps:
- uses: actions/checkout@v3
- name: Setup .NET
uses: actions/setup-dotnet@v3
with:
dotnet-version: 9.0.x
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.11'
- name: Install dependencies
run: |
dotnet restore
pip install -r requirements.txt
- name: Run unit tests
run: |
dotnet test --logger "trx;LogFileName=test-results.trx"
pytest tests/unit --junit-xml=pytest-results.xml
- name: Run integration tests
env:
NEXUS_TEST_HOST: localhost:5000
run: |
dotnet test tests/integration --filter Category=Integration
pytest tests/integration -m integration
- name: Upload test results
uses: actions/upload-artifact@v3
if: always()
with:
name: test-results
path: |
**/*.trx
**/*-results.xml
- name: Publish test report
uses: dorny/test-reporter@v1
if: always()
with:
name: Module Tests
path: '**/*-results.xml'
reporter: java-junit
# GitLab CI example
stages:
- build
- test
- deploy
variables:
NEXUS_TEST_IMAGE: "nexus/test-environment:latest"
test:unit:
stage: test
script:
- dotnet test tests/unit --collect:"XPlat Code Coverage"
- pytest tests/unit --cov=modules --cov-report=xml
artifacts:
reports:
coverage_report:
coverage_format: cobertura
path: coverage.xml
test:integration:
stage: test
services:
- name: $NEXUS_TEST_IMAGE
alias: nexus-test
script:
- export NEXUS_TEST_HOST=nexus-test:5000
- dotnet test tests/integration
- pytest tests/integration -m integration
artifacts:
reports:
junit:
- '**/test-results.xml'
# Jenkins Pipeline example
pipeline {
agent any
stages {
stage('Setup') {
steps {
sh 'docker-compose -f test-env.yml up -d'
sh 'dotnet restore'
sh 'pip install -r requirements.txt'
}
}
stage('Unit Tests') {
steps {
sh 'dotnet test tests/unit --logger trx'
sh 'pytest tests/unit --junit-xml=unit-results.xml'
}
}
stage('Integration Tests') {
environment {
NEXUS_TEST_HOST = 'localhost:5000'
}
steps {
sh 'dotnet test tests/integration'
sh 'pytest tests/integration -m integration'
}
}
stage('Performance Tests') {
steps {
sh 'dotnet test tests/performance --filter Category=Performance'
}
}
}
post {
always {
junit '**/test-results.xml'
publishHTML([
reportDir: 'test-reports',
reportFiles: 'index.html',
reportName: 'Test Report'
])
sh 'docker-compose -f test-env.yml down'
}
}
}
Testing Best Practices
Test Organization
- Arrange-Act-Assert: Structure tests clearly
- One Assertion Per Test: Keep tests focused
- Descriptive Names: Test names should describe behavior
- Test Isolation: Tests should not depend on each other
- Fast Tests: Unit tests should run in milliseconds
Module Testing Checklist
- ✓ Test all lifecycle methods (OnInitialized, OnStarting, etc.)
- ✓ Test message handling for all subscribed patterns
- ✓ Test health check logic under various conditions
- ✓ Test error handling and recovery
- ✓ Test configuration changes
- ✓ Test resource cleanup in Dispose
- ✓ Test concurrent message handling
- ✓ Test performance under load
- ✓ Test integration with other modules
- ✓ Test negative scenarios and edge cases
Common Testing Patterns
Test Data Builders
Create builders for complex test data to improve readability and maintainability.
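For example, a builder centralizes defaults so tests only state what matters. A minimal sketch; the SensorReading shape here is illustrative, not an SDK type.

// Hypothetical test data type; your module will have its own
public class SensorReading
{
    public string SensorId { get; set; } = "";
    public double Value { get; set; }
    public DateTime Timestamp { get; set; }
}

// Builder keeps test setup readable and centralizes defaults
public class SensorReadingBuilder
{
    private string _sensorId = "sensor-1";
    private double _value = 20.0;
    private DateTime _timestamp = DateTime.UtcNow;

    public SensorReadingBuilder WithSensorId(string id) { _sensorId = id; return this; }
    public SensorReadingBuilder WithValue(double value) { _value = value; return this; }
    public SensorReadingBuilder WithTimestamp(DateTime ts) { _timestamp = ts; return this; }

    public SensorReading Build() => new SensorReading
    {
        SensorId = _sensorId,
        Value = _value,
        Timestamp = _timestamp
    };
}

// Usage in a test:
// var reading = new SensorReadingBuilder().WithValue(85.5).Build();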
Test Fixtures
Share common setup between tests while maintaining isolation.
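In xUnit, one way to do this is a class fixture: expensive setup runs once per test class while each test still builds its own module and mock. The configuration payload below is illustrative.

// Created once per test class, disposed afterwards
public class ModuleTestFixture : IDisposable
{
    public ModuleTestFixture()
    {
        // One-time setup shared by all tests in the class
        TestConfig = new Dictionary<string, object> { ["threshold"] = 80.0 };
    }

    public Dictionary<string, object> TestConfig { get; }

    public void Dispose()
    {
        // One-time teardown
    }
}

public class ModuleTests : IClassFixture<ModuleTestFixture>
{
    private readonly ModuleTestFixture _fixture;

    public ModuleTests(ModuleTestFixture fixture) => _fixture = fixture;

    [Fact]
    public void UsesSharedConfig()
    {
        // Each test still creates its own module and mock context for isolation
        Assert.Equal(80.0, _fixture.TestConfig["threshold"]);
    }
}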
Parameterized Tests
Test multiple scenarios with the same test logic.
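With xUnit this is a [Theory]; the threshold check below is a stand-in for your module's real logic (the 80.0 threshold mirrors the earlier test scenarios).

public class ThresholdTests
{
    [Theory]
    [InlineData(70.0, false)] // below threshold - no alert expected
    [InlineData(80.0, false)] // at threshold - boundary case
    [InlineData(85.5, true)]  // above threshold - alert expected
    public void AlertRaisedOnlyAboveThreshold(double reading, bool expectAlert)
    {
        var raised = reading > 80.0; // stand-in for the module's check
        Assert.Equal(expectAlert, raised);
    }
}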
Integration Test Scenarios
Test complete workflows across multiple modules.
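The MockModuleContext from the mocking section above can host a small workflow test: wire a module to the mock bus, feed it an inbound message, and assert on what it forwards. ForwardingModule here is a minimal stand-in for a real module.

// Minimal stand-in module; real modules implement the SDK base classes
public class ForwardingModule
{
    private IModuleContext _context = null!;

    public async Task InitializeAsync(IModuleContext context)
    {
        _context = context;
        // Re-publish every sensor reading as a temperature reading
        await context.SubscribeAsync("sensor.*", async msg =>
            await _context.PublishAsync("temperature.reading", msg.Payload));
    }
}

[Fact]
public async Task SensorToReadingWorkflow()
{
    var bus = new MockModuleContext();
    var module = new ForwardingModule();
    await module.InitializeAsync(bus);

    // Inbound sensor data should be forwarded by the module
    await bus.SimulateMessageAsync("sensor.temp-1", new { value = 90.0 });
    bus.VerifyPublished("temperature.reading");
}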
Observability
The Nexus-1 SDK provides comprehensive observability features through OpenTelemetry integration, enabling distributed tracing, metrics collection, and structured logging.
Distributed Tracing
Track requests across multiple modules and systems:
OpenTelemetry Integration
using Nexus1.SDK.Observability;
using OpenTelemetry.Trace;
public class TracedModule : ModuleBase
{
private readonly ITracer _tracer;
public TracedModule()
{
_tracer = TracerProvider.Default.GetTracer("MyModule");
}
public async Task ProcessRequestAsync(Request request)
{
// Start a new span
using var span = _tracer.StartActiveSpan("ProcessRequest");
try
{
// Add span attributes
span.SetAttribute("request.id", request.Id);
span.SetAttribute("request.type", request.Type);
// Create child spans for sub-operations
using (var dbSpan = _tracer.StartActiveSpan("database.query"))
{
dbSpan.SetAttribute("db.statement", "SELECT * FROM data WHERE id = ?");
var data = await QueryDatabaseAsync(request.Id);
}
// Trace external service calls
using (var httpSpan = _tracer.StartActiveSpan("http.request"))
{
httpSpan.SetAttribute("http.method", "POST");
httpSpan.SetAttribute("http.url", "https://api.example.com/process");
var result = await CallExternalServiceAsync(data);
}
span.SetStatus(Status.Ok);
}
catch (Exception ex)
{
span.RecordException(ex);
span.SetStatus(Status.Error.WithDescription(ex.Message));
throw;
}
}
}
from nexus_sdk.observability import tracer
from opentelemetry import trace
class TracedModule(ModuleBase):
def __init__(self):
super().__init__()
self.tracer = trace.get_tracer("MyModule")
async def process_request(self, request):
# Start a new span
with self.tracer.start_as_current_span("ProcessRequest") as span:
try:
# Add span attributes
span.set_attribute("request.id", request.id)
span.set_attribute("request.type", request.type)
# Create child spans
with self.tracer.start_as_current_span("database.query") as db_span:
db_span.set_attribute("db.statement", "SELECT * FROM data WHERE id = ?")
data = await self.query_database(request.id)
# Trace external calls
with self.tracer.start_as_current_span("http.request") as http_span:
http_span.set_attribute("http.method", "POST")
http_span.set_attribute("http.url", "https://api.example.com/process")
result = await self.call_external_service(data)
span.set_status(trace.Status(trace.StatusCode.OK))
except Exception as ex:
span.record_exception(ex)
span.set_status(trace.Status(trace.StatusCode.ERROR, str(ex)))
raise
#include <nexus/observability.h>
#include <opentelemetry/trace/tracer.h>
class TracedModule : public ModuleBase {
private:
std::shared_ptr<opentelemetry::trace::Tracer> tracer;
public:
TracedModule() : tracer(opentelemetry::trace::Provider::GetTracerProvider()->GetTracer("MyModule")) {}
async_task<void> process_request(const Request& request) {
// Start a new span
auto span = tracer->StartSpan("ProcessRequest");
auto scope = tracer->WithActiveSpan(span);
try {
// Add span attributes
span->SetAttribute("request.id", request.id);
span->SetAttribute("request.type", request.type);
// Create child spans
{
auto db_span = tracer->StartSpan("database.query");
auto db_scope = tracer->WithActiveSpan(db_span);
db_span->SetAttribute("db.statement", "SELECT * FROM data WHERE id = ?");
auto data = co_await query_database(request.id);
}
// Trace external calls
{
auto http_span = tracer->StartSpan("http.request");
auto http_scope = tracer->WithActiveSpan(http_span);
http_span->SetAttribute("http.method", "POST");
http_span->SetAttribute("http.url", "https://api.example.com/process");
auto result = co_await call_external_service(data);
}
span->SetStatus(opentelemetry::trace::StatusCode::kOk);
} catch (const std::exception& ex) {
span->RecordException(ex);
span->SetStatus(opentelemetry::trace::StatusCode::kError, ex.what());
throw;
}
}
};
Metrics Collection
Collect and export metrics using OpenTelemetry:
Metrics Implementation
using Nexus1.SDK.Metrics;
using OpenTelemetry.Metrics;
public class MetricModule : ModuleBase
{
private readonly Counter<long> _processedCounter;
private readonly Histogram<double> _processingTime;
private readonly ObservableGauge<int> _queueSize;
public MetricModule(IMeterProvider meterProvider)
{
var meter = meterProvider.GetMeter("MyModule", "1.0.0");
// Create counter for processed items
_processedCounter = meter.CreateCounter<long>(
"items_processed",
"items",
"Number of items processed");
// Create histogram for processing time
_processingTime = meter.CreateHistogram<double>(
"processing_duration",
"ms",
"Time taken to process items");
// Create observable gauge for queue size
_queueSize = meter.CreateObservableGauge<int>(
"queue_size",
() => GetCurrentQueueSize(),
"items",
"Current number of items in queue");
}
public async Task ProcessItemAsync(Item item)
{
var stopwatch = Stopwatch.StartNew();
try
{
await ProcessAsync(item);
// Record metrics
_processedCounter.Add(1, new KeyValuePair<string, object?>("status", "success"));
_processingTime.Record(stopwatch.ElapsedMilliseconds);
}
catch (Exception ex)
{
_processedCounter.Add(1, new KeyValuePair<string, object?>("status", "error"));
throw;
}
}
}
from nexus_sdk.metrics import get_meter
from opentelemetry.metrics import Counter, Histogram, ObservableGauge
import time
class MetricModule(ModuleBase):
def __init__(self, meter_provider):
super().__init__()
meter = meter_provider.get_meter("MyModule", "1.0.0")
# Create counter for processed items
self.processed_counter = meter.create_counter(
"items_processed",
unit="items",
description="Number of items processed"
)
# Create histogram for processing time
self.processing_time = meter.create_histogram(
"processing_duration",
unit="ms",
description="Time taken to process items"
)
# Create observable gauge for queue size
self.queue_size = meter.create_observable_gauge(
"queue_size",
callbacks=[self.get_current_queue_size],
unit="items",
description="Current number of items in queue"
)
async def process_item(self, item):
start_time = time.time()
try:
await self.process(item)
# Record metrics
self.processed_counter.add(1, {"status": "success"})
duration_ms = (time.time() - start_time) * 1000
self.processing_time.record(duration_ms)
except Exception as ex:
self.processed_counter.add(1, {"status": "error"})
raise
#include <nexus/metrics.h>
#include <opentelemetry/metrics/meter.h>
#include <chrono>
class MetricModule : public ModuleBase {
private:
std::shared_ptr<opentelemetry::metrics::Counter<uint64_t>> processed_counter;
std::shared_ptr<opentelemetry::metrics::Histogram<double>> processing_time;
std::shared_ptr<opentelemetry::metrics::ObservableGauge<int>> queue_size;
public:
MetricModule(std::shared_ptr<opentelemetry::metrics::MeterProvider> meter_provider) {
auto meter = meter_provider->GetMeter("MyModule", "1.0.0");
// Create counter for processed items
processed_counter = meter->CreateUInt64Counter(
"items_processed",
"Number of items processed",
"items"
);
// Create histogram for processing time
processing_time = meter->CreateDoubleHistogram(
"processing_duration",
"Time taken to process items",
"ms"
);
// Create observable gauge for queue size
queue_size = meter->CreateInt64ObservableGauge(
"queue_size",
"Current number of items in queue",
"items"
);
queue_size->AddCallback([this](opentelemetry::metrics::ObserverResult observer_result) {
observer_result.Observe(get_current_queue_size());
});
}
async_task<void> process_item(const Item& item) {
auto start = std::chrono::steady_clock::now();
try {
co_await process(item);
// Record metrics
processed_counter->Add(1, {{"status", "success"}});
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - start
).count();
processing_time->Record(duration);
} catch (const std::exception& ex) {
processed_counter->Add(1, {{"status", "error"}});
throw;
}
}
};
Structured Logging
Use structured logging with correlation across distributed systems:
Enhanced Logging
using Nexus1.SDK.Logging;
using Microsoft.Extensions.Logging;
public class LoggingModule : ModuleBase
{
private readonly ILogger<LoggingModule> _logger;
public async Task ProcessAsync(WorkItem item)
{
// Automatic correlation ID injection
using (_logger.BeginScope(new Dictionary<string, object>
{
["CorrelationId"] = item.CorrelationId,
["WorkItemId"] = item.Id,
["WorkItemType"] = item.Type
}))
{
_logger.LogInformation("Starting work item processing");
try
{
var result = await ProcessWorkItemAsync(item);
_logger.LogInformation("Work item processed successfully",
new { ResultCode = result.Code, Duration = result.Duration });
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to process work item",
new { ItemId = item.Id, ErrorCode = ex.HResult });
throw;
}
}
}
}
Health Checks
Implement comprehensive health checks for monitoring:
Health Check Implementation
using Nexus1.SDK.Health;
public class HealthyModule : ModuleBase, IHealthCheck
{
private IHealthCheckService _healthService; // assigned in InitializeAsync, so not readonly
public override async Task InitializeAsync(IModuleContext context)
{
await base.InitializeAsync(context);
// Register health checks
_healthService = context.GetService<IHealthCheckService>();
_healthService.RegisterCheck("database", CheckDatabaseHealth);
_healthService.RegisterCheck("api", CheckApiHealth);
_healthService.RegisterCheck("cache", CheckCacheHealth);
}
private async Task<HealthCheckResult> CheckDatabaseHealth()
{
try
{
await _database.PingAsync();
return HealthCheckResult.Healthy("Database connection is healthy");
}
catch (Exception ex)
{
return HealthCheckResult.Unhealthy("Database connection failed", ex);
}
}
private async Task<HealthCheckResult> CheckApiHealth()
{
var response = await _httpClient.GetAsync("/health");
if (response.IsSuccessStatusCode)
{
return HealthCheckResult.Healthy("API is responsive");
}
return HealthCheckResult.Degraded($"API returned {response.StatusCode}");
}
}
Dashboards and Alerting
Configure dashboards and alerts for your modules:
Monitoring Configuration
# Prometheus configuration for metrics
scrape_configs:
- job_name: 'nexus-modules'
static_configs:
- targets: ['localhost:9090']
metrics_path: '/metrics'
# Example Grafana dashboard query
rate(items_processed[5m])
# Prometheus alerting rule (routed via Alertmanager)
groups:
- name: module_alerts
rules:
- alert: HighErrorRate
expr: rate(items_processed{status="error"}[5m]) > 0.1
for: 5m
labels:
severity: warning
annotations:
summary: "High error rate in module {{ $labels.module }}"
Developer Tools
The nexus-dev CLI provides comprehensive tooling for Nexus-1 module development:
Project Generation
Create New Modules
# Generate a new module project
nexus-dev generate --name MyAwesomeModule --type data-processor --language csharp
# Generate with specific features
nexus-dev generate --name AdvancedModule \
--type service \
--features messaging,state,security \
--test-framework nunit \
--include-docker
# Generate from custom template
nexus-dev generate --template https://github.com/company/nexus-templates/advanced-service
Module Validation
Validate Modules and Manifests
# Validate module manifest
nexus-dev validate manifest nexus-module.yaml
# Validate with specific rules
nexus-dev validate manifest nexus-module.yaml \
--rules security,performance,compatibility
# Validate entire project
nexus-dev validate project ./src \
--include-tests \
--check-dependencies \
--verify-signatures
Module Packaging
Package for Distribution
# Create a module package
nexus-dev package --project MyModule.csproj --output ./dist
# Package with custom metadata
nexus-dev package \
--project MyModule.csproj \
--version 2.1.0 \
--release-notes "Fixed critical bug in message handling" \
--tags "iot,sensors,telemetry" \
--sign-with certificate.pfx
# Create multi-platform package
nexus-dev package \
--project MyModule.csproj \
--platforms linux-x64,win-x64,osx-arm64 \
--include-runtime
Development Server
Local Development Environment
# Start local development server
nexus-dev serve --module ./bin/Debug/MyModule.dll
# Serve with hot reload
nexus-dev serve \
--module ./bin/Debug/MyModule.dll \
--watch ./src \
--reload-on-change
# Serve with mock services
nexus-dev serve \
--module ./bin/Debug/MyModule.dll \
--mock-services messagebus,statestore \
--mock-data ./test-data.json
Testing Tools
Test Execution and Coverage
# Run module tests
nexus-dev test --project MyModule.Tests.csproj
# Run with coverage
nexus-dev test \
--project MyModule.Tests.csproj \
--coverage \
--coverage-format cobertura \
--output-report ./coverage
# Run integration tests
nexus-dev test integration \
--module MyModule.dll \
--test-host local \
--scenarios ./integration-tests
Module Analysis
Code Quality and Best Practices
# Analyze module for best practices
nexus-dev analyze --project ./src
# Detailed analysis with recommendations
nexus-dev analyze \
--project ./src \
--checks security,performance,maintainability \
--output-format json \
--output-file analysis-report.json
# Compare with baseline
nexus-dev analyze \
--project ./src \
--baseline ./analysis-baseline.json \
--fail-on-regression
Documentation Generation
API Documentation
# Generate API documentation
nexus-dev docs generate --project MyModule.csproj --output ./docs
# Generate with custom templates
nexus-dev docs generate \
--project MyModule.csproj \
--template docfx \
--include-samples \
--include-diagrams
# Generate module manifest documentation
nexus-dev docs manifest \
--input nexus-module.yaml \
--format markdown \
--output MODULE.md
Deployment Tools
Deploy to Nexus Hosts
# Deploy to local Nexus host
nexus-dev deploy --package MyModule.nupkg --host localhost:5000
# Deploy with configuration
nexus-dev deploy \
--package MyModule.nupkg \
--host production.nexus.local \
--config ./configs/production.json \
--wait-for-healthy
# Rolling deployment
nexus-dev deploy rolling \
--package MyModule.nupkg \
--hosts host1,host2,host3 \
--batch-size 1 \
--health-check-interval 30s
Debugging Techniques
Effective debugging is crucial for developing reliable NEXUS-1 modules. This section covers tools and techniques for diagnosing and resolving issues during development and production.
Logging and Tracing
Using the SDK Logger
The NEXUS-1 SDK provides structured logging capabilities for all modules. Use appropriate log levels and structured data for effective debugging.
public class TemperatureSensorModule : ModuleBase
{
protected override void OnInitialized()
{
// Log levels: Trace, Debug, Info, Warning, Error, Fatal
Logger.LogInformation("Module initialized successfully");
// Structured logging with named placeholders
Logger.LogDebug("Sensor configuration loaded: Sensors={SensorCount}, Rate={SamplingRate}, Buffer={BufferSize}",
    5, 100, 1000);
// Log with correlation ID for tracing
using (Logger.BeginScope(new { CorrelationId = Guid.NewGuid() }))
{
Logger.LogInformation("Starting sensor calibration");
CalibrateSensors();
Logger.LogInformation("Calibration completed");
}
}
private async Task ProcessSensorData(SensorReading reading)
{
// Trace level for detailed debugging
Logger.LogTrace("Processing reading: {@Reading}", reading);
try
{
var result = await ValidateReading(reading);
Logger.LogDebug("Validation result: {Result}", result);
if (!result.IsValid)
{
Logger.LogWarning("Invalid reading detected: {Reason}",
result.ValidationError);
}
}
catch (Exception ex)
{
// Log exceptions with full context
Logger.LogError(ex, "Failed to process sensor reading {SensorId}",
reading.SensorId);
throw;
}
}
// Performance logging
private async Task<T> LogPerformance<T>(string operation, Func<Task<T>> action)
{
var sw = Stopwatch.StartNew();
try
{
var result = await action();
Logger.LogDebug("{Operation} completed in {ElapsedMs}ms",
operation, sw.ElapsedMilliseconds);
return result;
}
catch (Exception ex)
{
Logger.LogError(ex, "{Operation} failed after {ElapsedMs}ms",
operation, sw.ElapsedMilliseconds);
throw;
}
}
}
import time
import traceback
from contextlib import contextmanager
from functools import wraps
class TemperatureSensorModule(Module):
def on_initialized(self):
# Log levels: TRACE, DEBUG, INFO, WARNING, ERROR, FATAL
self.logger.info("Module initialized successfully")
# Structured logging with extra fields
self.logger.debug("Sensor configuration loaded", extra={
'sensor_count': 5,
'sampling_rate': 100,
'buffer_size': 1000
})
# Log with context manager for correlation
with self.logger.context(correlation_id=str(uuid.uuid4())):
self.logger.info("Starting sensor calibration")
self.calibrate_sensors()
self.logger.info("Calibration completed")
async def process_sensor_data(self, reading):
# Trace level for detailed debugging
self.logger.trace(f"Processing reading: {reading}")
try:
result = await self.validate_reading(reading)
self.logger.debug(f"Validation result: {result}")
if not result.is_valid:
self.logger.warning(
f"Invalid reading detected: {result.validation_error}",
extra={'sensor_id': reading.sensor_id}
)
except Exception as e:
# Log exceptions with full traceback
self.logger.error(
f"Failed to process sensor reading {reading.sensor_id}",
exc_info=True,
extra={'reading': reading.__dict__}
)
raise
# Performance logging decorator
def log_performance(operation_name):
def decorator(func):
@wraps(func)
async def wrapper(self, *args, **kwargs):
start_time = time.perf_counter()
try:
result = await func(self, *args, **kwargs)
elapsed = (time.perf_counter() - start_time) * 1000
self.logger.debug(
f"{operation_name} completed in {elapsed:.2f}ms"
)
return result
except Exception as e:
elapsed = (time.perf_counter() - start_time) * 1000
self.logger.error(
f"{operation_name} failed after {elapsed:.2f}ms",
exc_info=True
)
raise
return wrapper
return decorator
# Conditional debug logging
def debug_dump_state(self):
if self.logger.is_debug_enabled():
state = {
'active_sensors': len(self.sensors),
'buffer_usage': self.buffer.size(),
'last_reading': self.last_reading,
'error_count': self.error_count
}
self.logger.debug(f"Module state dump: {state}")
#include <chrono>
#include <fmt/format.h>
class TemperatureSensorModule : public nexus::ModuleBase {
protected:
void on_initialized() override {
// Log levels: trace, debug, info, warn, error, fatal
logger()->info("Module initialized successfully");
// Structured logging with fmt
logger()->debug("Sensor configuration loaded: sensors={}, rate={}, buffer={}",
sensor_count_, sampling_rate_, buffer_size_);
// Scoped logging for correlation
auto scope = logger()->with_fields({
{"correlation_id", generate_uuid()},
{"operation", "calibration"}
});
scope->info("Starting sensor calibration");
calibrate_sensors();
scope->info("Calibration completed");
}
private:
void process_sensor_data(const SensorReading& reading) {
// Trace level for verbose debugging
logger()->trace("Processing reading: sensor={}, value={}, timestamp={}",
reading.sensor_id, reading.value, reading.timestamp);
try {
auto result = validate_reading(reading);
logger()->debug("Validation result: {}", result.to_string());
if (!result.is_valid) {
logger()->warn("Invalid reading detected: {} (sensor: {})",
result.error_message, reading.sensor_id);
}
} catch (const std::exception& e) {
// Log exceptions with context
logger()->error("Failed to process sensor reading: {} (sensor: {})",
e.what(), reading.sensor_id);
throw;
}
}
// RAII performance logger
class PerfLogger {
nexus::Logger* logger_;
std::string operation_;
std::chrono::steady_clock::time_point start_;
public:
PerfLogger(nexus::Logger* logger, std::string operation)
: logger_(logger), operation_(std::move(operation)),
start_(std::chrono::steady_clock::now()) {}
~PerfLogger() {
auto elapsed = std::chrono::steady_clock::now() - start_;
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(elapsed);
logger_->debug("{} completed in {}ms", operation_, ms.count());
}
};
// Usage example
void complex_operation() {
PerfLogger perf(logger(), "complex_operation");
// Operation code here - timing logged automatically
}
// Conditional debug output
void debug_dump_state() {
if (logger()->is_enabled(nexus::LogLevel::Debug)) {
logger()->debug("Module state: sensors={}, errors={}, uptime={}s",
active_sensors_.size(),
error_count_,
std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::steady_clock::now() - start_time_
).count()
);
}
}
};
classdef TemperatureSensorModule < Module
methods (Access = protected)
function onInitialized(obj)
% Log levels: trace, debug, info, warning, error, fatal
obj.logger.info('Module initialized successfully');
% Structured logging with additional data
config = struct(...
'sensorCount', 5, ...
'samplingRate', 100, ...
'bufferSize', 1000 ...
);
obj.logger.debug('Sensor configuration loaded', config);
% Logging with correlation ID
correlationId = char(java.util.UUID.randomUUID());
obj.logger.info(['Starting calibration [', correlationId, ']']);
obj.calibrateSensors();
obj.logger.info(['Calibration completed [', correlationId, ']']);
end
end
methods (Access = private)
function processSensorData(obj, reading)
% Trace level for detailed debugging
obj.logger.trace(sprintf('Processing reading: sensor=%s, value=%.2f', ...
reading.sensorId, reading.value));
try
result = obj.validateReading(reading);
obj.logger.debug(['Validation result: ', result.toString()]);
if ~result.isValid
obj.logger.warning(sprintf(...
'Invalid reading detected: %s (sensor: %s)', ...
result.errorMessage, reading.sensorId));
end
catch ME
% Log exceptions with stack trace
obj.logger.error(sprintf(...
'Failed to process sensor reading %s: %s', ...
reading.sensorId, ME.message));
obj.logger.debug(getReport(ME, 'extended'));
rethrow(ME);
end
end
% Performance logging helper
function result = logPerformance(obj, operation, func)
tic;
try
result = func();
elapsed = toc * 1000; % Convert to ms
obj.logger.debug(sprintf('%s completed in %.2fms', ...
operation, elapsed));
catch ME
elapsed = toc * 1000;
obj.logger.error(sprintf('%s failed after %.2fms: %s', ...
operation, elapsed, ME.message));
rethrow(ME);
end
end
% Conditional debug logging
function debugDumpState(obj)
if obj.logger.isDebugEnabled()
state = struct(...
'activeSensors', length(obj.sensors), ...
'bufferUsage', obj.buffer.size(), ...
'lastReading', obj.lastReading, ...
'errorCount', obj.errorCount ...
);
obj.logger.debug(['Module state: ', jsonencode(state)]);
end
end
% Log method entry/exit for debugging
function logMethodBoundary(obj, methodName, entering)
if obj.logger.isTraceEnabled()
if entering
obj.logger.trace(['Entering ', methodName]);
else
obj.logger.trace(['Exiting ', methodName]);
end
end
end
end
end
// Logging Best Practices in LabVIEW
//
// Log Levels (use appropriate VIs):
// - Nexus.Log.Trace: Detailed debugging info
// - Nexus.Log.Debug: Debug information
// - Nexus.Log.Info: General information
// - Nexus.Log.Warning: Warning conditions
// - Nexus.Log.Error: Error conditions
// - Nexus.Log.Fatal: Fatal errors
//
// Structured Logging Pattern:
// 1. Create cluster with log data:
// - Message (string)
// - Level (enum)
// - Timestamp (timestamp)
// - Additional Data (variant)
//
// 2. Use Nexus.Log VI with inputs:
// - Log Level
// - Message
// - Data Cluster (optional)
//
// Performance Logging SubVI Pattern:
// Inputs:
// - Operation Name (string)
// - Module Context
// Outputs:
// - Error Out
//
// Implementation:
// 1. Get Tick Count at start
// 2. Place operation code here
// 3. Get Tick Count at end
// 4. Calculate elapsed time
// 5. Log with Nexus.Log.Debug
//
// Exception Logging Pattern:
// In error case structure:
// 1. Get Error Source
// 2. Get Error Code
// 3. Get Error Description
// 4. Bundle into cluster
// 5. Use Nexus.Log.Error VI
//
// Conditional Debug Logging:
// 1. Use Nexus.Logger.IsEnabled VI
// 2. Wire to case structure
// 3. Only execute debug code if enabled
//
// Correlation ID Pattern:
// 1. Generate GUID (LabVIEW built-in)
// 2. Add to log data cluster
// 3. Pass through all related operations
// 4. Include in all log messages
//
// Best Practices:
// - Use Error Cluster for all operations
// - Include context in log messages
// - Log at appropriate levels
// - Avoid logging in tight loops
// - Use conditional logging for performance
Log Levels Guidelines
| Level | When to Use | Examples |
| --- | --- | --- |
| TRACE | Very detailed debugging information | Method entry/exit, variable values, loop iterations |
| DEBUG | Debugging information for development | Configuration loaded, state changes, calculation results |
| INFO | Important runtime events | Module started, connection established, major operations |
| WARNING | Potentially harmful situations | Deprecated features, recoverable errors, performance issues |
| ERROR | Error events that don't stop the module | Failed operations, invalid data, connection losses |
| FATAL | Severe errors causing module shutdown | Unrecoverable errors, critical resource failures |
Development Tools
IDE Debugging
Visual Studio / VS Code (C#)
- Set breakpoints in module code
- Attach debugger to running NEXUS-1 process
- Use conditional breakpoints for specific scenarios
- Watch message bus activity in debug windows
- Debug async code with Tasks window
PyCharm / VS Code (Python)
- Remote debugging for distributed modules
- Set breakpoints in async handlers
- Use debugger console for runtime inspection
- Profile CPU and memory usage
- Debug with pytest integration
GDB / LLDB (C++)
- Attach to running module processes
- Set watchpoints on critical variables
- Use thread-specific breakpoints
- Analyze core dumps for crashes
- Memory debugging with sanitizers
Remote Debugging
# Enable remote debugging in the module manifest
modules:
- name: "temperature-monitor"
type: "process"
executable: "TemperatureMonitor.dll"
environment:
VSDBG_ENABLE: "1"
VSDBG_PORT: "4024"
// In your module code, add debug helpers
public class TemperatureMonitor : ModuleBase
{
protected override void OnInitialized()
{
#if DEBUG
// Wait for debugger attachment
Logger.LogInformation($"Waiting for debugger on PID: {Process.GetCurrentProcess().Id}");
while (!Debugger.IsAttached)
{
Thread.Sleep(100);
}
Debugger.Break();
#endif
// Normal initialization
Initialize();
}
// Conditional debug code
[Conditional("DEBUG")]
private void DebugDumpState()
{
Logger.LogDebug("Current state: {@State}", new
{
ActiveSensors = sensors.Count,
QueueDepth = messageQueue.Count,
LastUpdate = lastUpdateTime
});
}
}
# Enable remote debugging with debugpy
import os
if os.environ.get('ENABLE_DEBUG', '').lower() == 'true':
import debugpy
debugpy.listen(5678)
print(f"Waiting for debugger on port 5678 (PID: {os.getpid()})")
debugpy.wait_for_client()
debugpy.breakpoint()
class TemperatureMonitor(Module):
def on_initialized(self):
# Use pdb for interactive debugging
if self.config.get('debug_mode', False):
import pdb
pdb.set_trace()
# Normal initialization
self.initialize()
# Debug decorator for development
def debug_only(func):
def wrapper(self, *args, **kwargs):
if self.logger.is_debug_enabled():
return func(self, *args, **kwargs)
return wrapper
@debug_only
def dump_internal_state(self):
"""Dump internal state for debugging"""
import pprint
state = {
'active_sensors': len(self.sensors),
'queue_depth': self.message_queue.qsize(),
'last_update': self.last_update_time,
'error_stats': self.error_counter
}
self.logger.debug(f"Internal state:\n{pprint.pformat(state)}")
# Remote inspection endpoint
async def handle_debug_request(self, message):
if message.payload.get('command') == 'dump_state':
return {
'sensors': [s.to_dict() for s in self.sensors],
'stats': self.get_statistics(),
'config': self.config
}
// Enable GDB server for remote debugging
class TemperatureMonitor : public nexus::ModuleBase {
protected:
void on_initialized() override {
#ifdef DEBUG
// Print PID for attaching debugger
logger()->info("Module PID: {} - Ready for debugger", getpid());
// Optional: Wait for debugger
if (std::getenv("WAIT_FOR_DEBUGGER")) {
logger()->info("Waiting for debugger attachment...");
std::raise(SIGSTOP);
}
#endif
// Normal initialization
initialize();
}
private:
// Debug-only methods
#ifdef DEBUG
void dump_internal_state() {
logger()->debug("Internal state dump:");
logger()->debug(" Active sensors: {}", sensors_.size());
logger()->debug(" Message queue depth: {}", message_queue_.size());
logger()->debug(" Memory usage: {} MB", get_memory_usage_mb());
// Dump sensor states
for (const auto& [id, sensor] : sensors_) {
logger()->debug(" Sensor {}: last_value={}, errors={}",
id, sensor.last_value, sensor.error_count);
}
}
// Debug command handler
void handle_debug_command(const std::string& cmd) {
if (cmd == "break") {
__builtin_debugtrap(); // Breakpoint
} else if (cmd == "dump") {
dump_internal_state();
} else if (cmd == "stats") {
dump_performance_stats();
}
}
#endif
// Assertion helpers
void debug_assert(bool condition, const std::string& message) {
#ifdef DEBUG
if (!condition) {
logger()->error("Assertion failed: {}", message);
std::abort();
}
#endif
}
};
Message Bus Debugging
Monitoring Message Traffic
Debug message flow between modules to identify communication issues.
public class MessageDebugger : ModuleBase
{
private readonly ConcurrentDictionary<string, MessageStats> _stats = new();

// Minimal stats holder for the examples below
private class MessageStats
{
    public long Count;
    public long TotalBytes;
}
protected override void OnInitialized()
{
// Subscribe to all messages for debugging
Messages.Subscribe("*", LogMessage);
Messages.Subscribe("**", LogMessage); // Nested topics
// Periodic stats dump
_ = Task.Run(async () =>
{
while (!cancellationToken.IsCancellationRequested)
{
await Task.Delay(TimeSpan.FromSeconds(10));
DumpMessageStats();
}
});
}
private async Task LogMessage(Message message)
{
// Log message details
Logger.LogTrace("Message received: Topic={Topic}, Size={Size}, Timestamp={Timestamp}",
message.Topic,
message.PayloadJson?.Length ?? 0,
message.Timestamp);
// Update statistics
_stats.AddOrUpdate(message.Topic,
new MessageStats { Count = 1, TotalBytes = message.PayloadJson?.Length ?? 0 },
(_, stats) =>
{
stats.Count++;
stats.TotalBytes += message.PayloadJson?.Length ?? 0;
return stats;
});
// Validate message format
try
{
if (!string.IsNullOrEmpty(message.PayloadJson))
{
var payload = JsonSerializer.Deserialize<JsonElement>(message.PayloadJson);
ValidateMessageStructure(message.Topic, payload);
}
}
catch (JsonException ex)
{
Logger.LogError(ex, "Invalid JSON in message {Topic}", message.Topic);
}
}
private void DumpMessageStats()
{
var stats = _stats.ToArray()
.OrderByDescending(kvp => kvp.Value.Count)
.Take(10);
Logger.LogInformation("Message Statistics (Top 10):");
foreach (var (topic, stat) in stats)
{
Logger.LogInformation(" {Topic}: {Count} messages, {TotalMB:F2} MB",
topic, stat.Count, stat.TotalBytes / 1024.0 / 1024.0);
}
}
// Message pattern analyzer
private void AnalyzeMessagePatterns()
{
var patterns = _stats.Keys
.GroupBy(topic => topic.Split('.').FirstOrDefault() ?? "unknown")
.Select(g => new { Prefix = g.Key, Topics = g.Count() });
Logger.LogDebug("Message patterns:");
foreach (var pattern in patterns)
{
Logger.LogDebug(" {Prefix}.*: {Count} unique topics",
pattern.Prefix, pattern.Topics);
}
}
}
import json
import asyncio
from collections import defaultdict, deque
from datetime import datetime, timedelta
class MessageDebugger(Module):
def __init__(self):
super().__init__()
self.message_stats = defaultdict(lambda: {'count': 0, 'bytes': 0})
self.message_history = deque(maxlen=1000) # Keep last 1000 messages
def on_initialized(self):
# Subscribe to all messages for debugging
self.subscribe("*", self.log_message)
self.subscribe("**", self.log_message) # Nested topics
# Start statistics reporter
asyncio.create_task(self.report_stats_periodically())
async def log_message(self, message):
# Log message details
self.logger.trace(
f"Message: topic={message.topic}, "
f"size={len(json.dumps(message.payload))}, "
f"timestamp={message.timestamp}"
)
# Store in history
self.message_history.append({
'topic': message.topic,
'timestamp': message.timestamp,
'size': len(json.dumps(message.payload)),
'payload_preview': str(message.payload)[:100]
})
# Update statistics
topic_stats = self.message_stats[message.topic]
topic_stats['count'] += 1
topic_stats['bytes'] += len(json.dumps(message.payload))
topic_stats['last_seen'] = datetime.now()
# Validate message structure
self.validate_message(message)
def validate_message(self, message):
"""Validate message structure and content"""
try:
# Check required fields based on topic pattern
if message.topic.startswith('sensor.'):
assert 'value' in message.payload, "Missing 'value' field"
assert 'sensor_id' in message.payload, "Missing 'sensor_id' field"
elif message.topic.startswith('command.'):
assert 'action' in message.payload, "Missing 'action' field"
except AssertionError as e:
self.logger.warning(
f"Message validation failed for {message.topic}: {e}"
)
async def report_stats_periodically(self):
"""Report message statistics every 10 seconds"""
while True:
await asyncio.sleep(10)
self.dump_message_stats()
def dump_message_stats(self):
"""Dump current message statistics"""
# Sort by message count
sorted_stats = sorted(
self.message_stats.items(),
key=lambda x: x[1]['count'],
reverse=True
)[:10]
self.logger.info("Message Statistics (Top 10):")
for topic, stats in sorted_stats:
mb = stats['bytes'] / 1024 / 1024
self.logger.info(
f" {topic}: {stats['count']} messages, {mb:.2f} MB"
)
# Analyze message patterns
self.analyze_patterns()
def analyze_patterns(self):
"""Analyze message topic patterns"""
patterns = defaultdict(int)
for topic in self.message_stats.keys():
prefix = topic.split('.')[0]
patterns[prefix] += 1
self.logger.debug("Topic patterns:")
for prefix, count in patterns.items():
self.logger.debug(f" {prefix}.*: {count} unique topics")
def get_message_trace(self, correlation_id=None, time_window=60):
"""Get message trace for debugging"""
now = datetime.now()
cutoff = now - timedelta(seconds=time_window)
# Filter messages
messages = [
msg for msg in self.message_history
if msg['timestamp'] > cutoff
]
if correlation_id:
# Filter by correlation ID if provided
messages = [
msg for msg in messages
if correlation_id in msg.get('payload_preview', '')
]
return messages
class MessageDebugger : public nexus::ModuleBase {
private:
struct MessageStats {
    std::atomic<uint64_t> count{0};
    std::atomic<uint64_t> total_bytes{0};
    std::chrono::steady_clock::time_point last_seen;
};
std::unordered_map<std::string, MessageStats> stats_;
std::mutex stats_mutex_;
std::deque<json> message_history_;  // recent messages as JSON snapshots
std::mutex history_mutex_;
protected:
void on_initialized() override {
// Subscribe to all messages
messages()->subscribe("*", [this](const Message& msg) {
log_message(msg);
});
// Start stats reporter
std::thread([this]() {
while (!should_stop()) {
std::this_thread::sleep_for(std::chrono::seconds(10));
dump_stats();
}
}).detach();
}
private:
void log_message(const Message& message) {
// Log message details
logger()->trace("Message: topic={}, size={}, timestamp={}",
message.topic,
message.payload_json.size(),
format_timestamp(message.timestamp));
// Update statistics
{
std::lock_guard lock(stats_mutex_);
auto& stats = stats_[message.topic];
stats.count++;
stats.total_bytes += message.payload_json.size();
stats.last_seen = std::chrono::steady_clock::now();
}
// Store in history
{
std::lock_guard lock(history_mutex_);
if (message_history_.size() >= 1000) {
message_history_.pop_front();
}
message_history_.push_back({
{"topic", message.topic},
{"timestamp", message.timestamp},
{"size", message.payload_json.size()},
{"preview", message.payload_json.substr(0, 100)}
});
}
// Validate message
validate_message(message);
}
void validate_message(const Message& message) {
try {
auto payload = json::parse(message.payload_json);
// Topic-based validation
if (message.topic.starts_with("sensor.")) {
if (!payload.contains("value")) {
logger()->warn("Missing 'value' in sensor message: {}",
message.topic);
}
if (!payload.contains("sensor_id")) {
logger()->warn("Missing 'sensor_id' in sensor message: {}",
message.topic);
}
}
} catch (const json::exception& e) {
logger()->error("Invalid JSON in message {}: {}",
message.topic, e.what());
}
}
void dump_stats() {
    // Snapshot counts under the lock; MessageStats holds atomics and is not copyable
    std::vector<std::tuple<std::string, uint64_t, uint64_t>> sorted_stats;
    {
        std::lock_guard lock(stats_mutex_);
        for (const auto& [topic, stats] : stats_) {
            sorted_stats.emplace_back(topic, stats.count.load(), stats.total_bytes.load());
        }
    }
    // Sort by message count, descending
    std::sort(sorted_stats.begin(), sorted_stats.end(),
        [](const auto& a, const auto& b) {
            return std::get<1>(a) > std::get<1>(b);
        });
    logger()->info("Message Statistics (Top 10):");
    for (size_t i = 0; i < std::min(size_t(10), sorted_stats.size()); ++i) {
        const auto& [topic, count, bytes] = sorted_stats[i];
        double mb = bytes / 1024.0 / 1024.0;
        logger()->info("  {}: {} messages, {:.2f} MB", topic, count, mb);
    }
}
// Message flow analyzer
std::vector analyze_message_flow(const std::string& correlation_id) {
std::vector flow;
std::lock_guard lock(history_mutex_);
for (const auto& msg : message_history_) {
if (msg["preview"].get().find(correlation_id) !=
std::string::npos) {
flow.push_back(msg);
}
}
return flow;
}
};
Message Inspection Tools
- Topic Wildcards: Use "*" for single level, "**" for all nested levels
- Message Filtering: Filter by topic pattern, payload content, or time range
- Performance Metrics: Track message rates, sizes, and latencies
- Correlation Tracking: Follow message flows using correlation IDs
- Dead Letter Analysis: Monitor failed message deliveries
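To make the wildcard and filtering rules concrete, here is a minimal Python sketch in the style of the modules above; the Module base class and subscribe signature follow the earlier examples, and the threshold filter is illustrative:
# Minimal sketch: wildcard subscriptions with payload filtering.
# Assumes the Module/subscribe API used in the Python examples above.
class TopicInspector(Module):
    async def on_start(self):
        # "sensor.*" matches one level (e.g. sensor.temperature);
        # "sensor.**" matches all nested levels (e.g. sensor.rack1.temperature)
        await self.subscribe("sensor.*", self.on_sensor)
        await self.subscribe("sensor.**", self.on_any_sensor)

    async def on_sensor(self, msg):
        # Filter by payload content before doing any expensive work
        if msg.payload.get("value", 0) > 100:  # threshold is illustrative
            self.logger.warning(f"High reading on {msg.topic}: {msg.payload}")

    async def on_any_sensor(self, msg):
        self.logger.debug(f"Observed {msg.topic}")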
Common Issues and Solutions
Module Startup Failures
Issue | Symptoms | Solution |
---|---|---|
Configuration Error | Module fails to start, config validation errors | Check manifest YAML syntax, verify required fields, validate config schema |
Missing Dependencies | DLL/library not found errors | Ensure all dependencies are in module directory or system path |
Permission Issues | Access denied errors during startup | Check file permissions, ensure module has required capabilities |
Port Conflicts | Bind errors for network modules | Check for port conflicts, use dynamic port allocation |
Resource Limits | Out of memory or file descriptor errors | Increase resource limits in manifest, optimize resource usage |
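For the configuration-error row in particular, a quick pre-deployment syntax check of the manifest catches many of these failures early. Below is a minimal sketch using PyYAML; the required-field list is an assumption and should match your host's actual schema:
# Minimal sketch: sanity-check manifest YAML before deployment.
import sys
import yaml  # PyYAML

def check_manifest(path="nexus-manifest.yaml"):
    try:
        with open(path) as f:
            manifest = yaml.safe_load(f)
    except yaml.YAMLError as e:
        sys.exit(f"Manifest syntax error: {e}")
    # Required fields here are illustrative; use your host's schema
    for field in ("id", "version", "capabilities"):
        if field not in manifest:
            sys.exit(f"Manifest missing required field: {field}")
    print("Manifest OK")

if __name__ == "__main__":
    check_manifest()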
Message Delivery Problems
// Debugging message delivery issues
public class DeliveryDebugger : ModuleBase
{
// Track sent messages and observed deliveries
private readonly ConcurrentDictionary<string, DateTime> _sentMessages = new();
private readonly ConcurrentDictionary<string, byte> _receivedMessages = new(); // populated by the module's subscription handler
private async Task PublishWithTracking(string topic, object payload)
{
var messageId = Guid.NewGuid().ToString();
var payloadWithId = new
{
_messageId = messageId,
_timestamp = DateTime.UtcNow,
data = payload
};
_sentMessages[messageId] = DateTime.UtcNow;
try
{
await Messages.PublishAsync(topic, payloadWithId);
Logger.LogDebug("Message {MessageId} published to {Topic}",
messageId, topic);
}
catch (Exception ex)
{
Logger.LogError(ex, "Failed to publish {MessageId} to {Topic}",
messageId, topic);
throw;
}
}
// Verify message delivery
private async Task<bool> VerifyDelivery(string messageId, TimeSpan timeout)
{
var deadline = DateTime.UtcNow.Add(timeout);
while (DateTime.UtcNow < deadline)
{
if (_receivedMessages.ContainsKey(messageId))
{
var latency = DateTime.UtcNow - _sentMessages[messageId];
Logger.LogDebug("Message {MessageId} delivered in {Latency}ms",
messageId, latency.TotalMilliseconds);
return true;
}
await Task.Delay(10);
}
Logger.LogWarning("Message {MessageId} not delivered within {Timeout}",
messageId, timeout);
return false;
}
// Debug subscription issues
private void DebugSubscriptions()
{
var subscriptions = Messages.GetActiveSubscriptions();
Logger.LogDebug("Active subscriptions:");
foreach (var sub in subscriptions)
{
Logger.LogDebug(" {Topic}: {HandlerCount} handlers",
sub.Topic, sub.HandlerCount);
}
}
}
# Debugging message delivery issues
import asyncio
import uuid
from datetime import datetime

class DeliveryDebugger(Module):
def __init__(self):
super().__init__()
self.sent_messages = {}
self.received_messages = set()
async def publish_with_tracking(self, topic, payload):
"""Publish message with delivery tracking"""
message_id = str(uuid.uuid4())
timestamp = datetime.now()
# Add tracking info
tracked_payload = {
'_message_id': message_id,
'_timestamp': timestamp.isoformat(),
'data': payload
}
self.sent_messages[message_id] = timestamp
try:
await self.publish(topic, tracked_payload)
self.logger.debug(f"Message {message_id} published to {topic}")
except Exception as e:
self.logger.error(
f"Failed to publish {message_id} to {topic}: {e}"
)
raise
async def verify_delivery(self, message_id, timeout=5.0):
"""Verify message was delivered"""
start_time = asyncio.get_event_loop().time()
while asyncio.get_event_loop().time() - start_time < timeout:
if message_id in self.received_messages:
sent_time = self.sent_messages.get(message_id)
if sent_time:
latency = (datetime.now() - sent_time).total_seconds() * 1000
self.logger.debug(
f"Message {message_id} delivered in {latency:.2f}ms"
)
return True
await asyncio.sleep(0.01)
self.logger.warning(
f"Message {message_id} not delivered within {timeout}s"
)
return False
def debug_subscriptions(self):
"""Debug current subscriptions"""
subs = self.get_active_subscriptions()
self.logger.debug("Active subscriptions:")
for topic, handlers in subs.items():
self.logger.debug(f" {topic}: {len(handlers)} handlers")
async def test_message_patterns(self):
"""Test various message patterns"""
test_cases = [
("exact.topic", "exact.topic", True),
("wildcard.*", "wildcard.test", True),
("nested.**", "nested.level1.level2", True),
("wrong.topic", "different.topic", False)
]
for subscription, publish_topic, should_receive in test_cases:
received = []
async def handler(msg):
received.append(msg)
await self.subscribe(subscription, handler)
await self.publish(publish_topic, {"test": True})
await asyncio.sleep(0.1)
if should_receive and not received:
self.logger.error(
f"Pattern test failed: {subscription} should receive {publish_topic}"
)
elif not should_receive and received:
self.logger.error(
f"Pattern test failed: {subscription} should NOT receive {publish_topic}"
)
Performance Bottlenecks
Identifying Bottlenecks
- Profile CPU usage during peak load
- Monitor memory allocation patterns
- Track message processing latencies
- Analyze thread/async task utilization
- Check for lock contention
Common Performance Issues
- Synchronous I/O: Use async operations for all I/O
- Unbounded Queues: Set limits on message queues (see the sketch after this list)
- Large Message Payloads: Compress or stream large data
- Inefficient Serialization: Use binary formats for high-frequency data
- Memory Leaks: Properly dispose resources and clear caches
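For the unbounded-queue issue above, here is a minimal Python sketch of a bounded queue that sheds the oldest item under load; the size limit and shedding policy are assumptions to adapt to your module:
# Minimal sketch: bounded message queue with explicit load shedding.
import asyncio

class BoundedPipeline:
    def __init__(self, maxsize=1000):  # limit is illustrative
        self.queue = asyncio.Queue(maxsize=maxsize)
        self.dropped = 0

    async def enqueue(self, item):
        if self.queue.full():
            # Drop the oldest item instead of growing without bound
            self.queue.get_nowait()
            self.dropped += 1
        await self.queue.put(item)

    async def worker(self, process):
        while True:
            item = await self.queue.get()
            try:
                await process(item)
            finally:
                self.queue.task_done()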
Memory Leak Detection
// C# Memory leak detection helpers
public class MemoryMonitor : ModuleBase
{
private readonly Timer _monitorTimer;
private long _lastGen2Count;
private long _lastTotalMemory;
public MemoryMonitor()
{
_monitorTimer = new Timer(CheckMemory, null,
TimeSpan.FromMinutes(1), TimeSpan.FromMinutes(1));
}
private void CheckMemory(object state)
{
var currentGen2 = GC.CollectionCount(2);
var currentMemory = GC.GetTotalMemory(false);
if (currentGen2 > _lastGen2Count)
{
Logger.LogDebug("Gen2 GC occurred. Count: {Count}, Memory: {Memory:N0} bytes",
currentGen2, currentMemory);
}
if (currentMemory > _lastTotalMemory * 1.5)
{
Logger.LogWarning("Memory usage increased significantly: {Previous:N0} -> {Current:N0}",
_lastTotalMemory, currentMemory);
// Force GC and check again
GC.Collect();
GC.WaitForPendingFinalizers();
GC.Collect();
var afterGC = GC.GetTotalMemory(false);
Logger.LogDebug("Memory after forced GC: {Memory:N0}", afterGC);
}
_lastGen2Count = currentGen2;
_lastTotalMemory = currentMemory;
}
}
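A rough Python counterpart can be sketched with the standard library's tracemalloc; comparing snapshots highlights where allocations are growing. The reporting interval and top-N count are arbitrary:
# Minimal sketch: periodic allocation-growth report with tracemalloc.
import asyncio
import logging
import tracemalloc

async def monitor_memory(logger: logging.Logger, interval: float = 60.0):
    tracemalloc.start()
    baseline = tracemalloc.take_snapshot()
    while True:
        await asyncio.sleep(interval)
        snapshot = tracemalloc.take_snapshot()
        # Largest allocation growth since the last check, grouped by source line
        for stat in snapshot.compare_to(baseline, "lineno")[:5]:
            logger.debug("Allocation growth: %s", stat)
        baseline = snapshot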
Testing in Development
Test Harness Setup
Create isolated test environments for module development.
// Example test harness for module development
public class ModuleTestHarness
{
private readonly TestMessageBus _messageBus;
private readonly IModule _moduleUnderTest;
public ModuleTestHarness()
{
_messageBus = new TestMessageBus();
var context = new TestModuleContext(_messageBus);
_moduleUnderTest = new TemperatureMonitor(context);
}
public async Task RunTest()
{
// Initialize module
await _moduleUnderTest.StartAsync();
// Simulate sensor data
await _messageBus.SimulateMessage("sensor.temperature", new
{
sensor_id = "sensor-1",
value = 25.5,
timestamp = DateTime.UtcNow
});
// Wait for processing
await Task.Delay(100);
// Verify output
var messages = _messageBus.GetPublishedMessages("temperature.processed");
Assert.Single(messages);
Assert.Equal(25.5, (double)messages[0].GetPayload<dynamic>().value);
}
}
// Test message bus implementation
public class TestMessageBus : IMessageClient
{
private readonly List<PublishedMessage> _published = new();
private readonly Dictionary<string, List<Func<Message, Task>>> _handlers = new();
public async Task PublishAsync(string topic, object payload)
{
_published.Add(new PublishedMessage(topic, payload));
// Deliver to subscribers
if (_handlers.TryGetValue(topic, out var handlers))
{
foreach (var handler in handlers)
{
await handler(new Message(topic, JsonSerializer.Serialize(payload)));
}
}
}
public ISubscription Subscribe(string pattern, Func<Message, Task> handler)
{
if (!_handlers.ContainsKey(pattern))
_handlers[pattern] = new List<Func<Message, Task>>();
_handlers[pattern].Add(handler);
return new TestSubscription(() => _handlers[pattern].Remove(handler));
}
public List<PublishedMessage> GetPublishedMessages(string topic)
{
return _published.Where(m => m.Topic == topic).ToList();
}
}
Debugging Tips
- Start Simple: Test basic functionality before complex scenarios
- Use Logging Liberally: Add detailed logs during development
- Test Edge Cases: Invalid inputs, timeouts, resource exhaustion
- Monitor Resources: Track memory, CPU, and handle usage
- Simulate Failures: Test error handling and recovery
- Use Correlation IDs: Track requests across module boundaries
- Profile Early: Identify performance issues during development
- Document Issues: Keep notes on resolved problems for future reference
Exception Handling
Robust exception handling is critical for building reliable NEXUS-1 modules. The SDK provides a comprehensive exception hierarchy and patterns for graceful error recovery.
Exception Hierarchy
SDK Exception Types
The NEXUS-1 SDK defines specific exception types for different error scenarios:
- NexusException: Base exception for all SDK-related errors
- ModuleException: Module lifecycle and operation errors
- MessageBusException: Message publishing/subscription failures
- ConfigurationException: Configuration loading or validation errors
- SecurityException: Authentication, authorization, or capability failures
- TimeoutException: Operation timeout errors
- NotInitializedException: Module used before initialization
Exception Properties
Rich Error Context
SDK exceptions provide detailed context for effective debugging and monitoring.
// NexusException properties
public class NexusException : Exception
{
public string ErrorCode { get; } // Unique error identifier
public ErrorSeverity Severity { get; } // Error severity level
public Dictionary<string, object> Context { get; } // Additional context
// Add context fluently
public NexusException WithContext(string key, object value);
}
// Error severity levels
public enum ErrorSeverity
{
Info, // Informational, can be ignored
Warning, // Should be investigated
Error, // Must be handled
Critical // Immediate action required
}
// Usage example
try
{
await ProcessDataAsync(data);
}
catch (ModuleException ex)
{
Logger.LogError(ex, "Module error: {ErrorCode}", ex.ErrorCode);
// Access rich context
foreach (var (key, value) in ex.Context)
{
Logger.LogError(" {Key}: {Value}", key, value);
}
// Check severity
if (ex.Severity == ErrorSeverity.Critical)
{
await EmergencyShutdownAsync();
}
}
# Exception properties
class NexusException(Exception):
def __init__(self, message: str, error_code: str = None,
severity: ErrorSeverity = ErrorSeverity.ERROR):
super().__init__(message)
self.error_code = error_code or "NEXUS_ERROR"
self.severity = severity
self.context = {}
def with_context(self, key: str, value: Any) -> 'NexusException':
"""Add context fluently"""
self.context[key] = value
return self
# Error severity levels
class ErrorSeverity(Enum):
INFO = "info" # Informational
WARNING = "warning" # Should be investigated
ERROR = "error" # Must be handled
CRITICAL = "critical" # Immediate action required
# Usage example
try:
await self.process_data(data)
except ModuleException as ex:
self.logger.error(f"Module error: {ex.error_code}", exc_info=ex)
# Access rich context
for key, value in ex.context.items():
self.logger.error(f" {key}: {value}")
# Check severity
if ex.severity == ErrorSeverity.CRITICAL:
await self.emergency_shutdown()
// Exception hierarchy
class NexusException : public std::exception {
public:
explicit NexusException(const std::string& message,
const std::string& error_code = "NEXUS_ERROR",
ErrorSeverity severity = ErrorSeverity::Error)
: message_(message), error_code_(error_code), severity_(severity) {}
const char* what() const noexcept override { return message_.c_str(); }
const std::string& error_code() const { return error_code_; }
ErrorSeverity severity() const { return severity_; }
// Add context
NexusException& with_context(const std::string& key, const std::string& value) {
context_[key] = value;
return *this;
}
const std::map<std::string, std::string>& context() const { return context_; }
private:
std::string message_;
std::string error_code_;
ErrorSeverity severity_;
std::map<std::string, std::string> context_;
};
// Usage example
try {
co_await ProcessDataAsync(data);
}
catch (const ModuleException& ex) {
logger->error("Module error: {}", ex.error_code());
// Access rich context
for (const auto& [key, value] : ex.context()) {
logger->error(" {}: {}", key, value);
}
// Check severity
if (ex.severity() == ErrorSeverity::Critical) {
co_await EmergencyShutdownAsync();
}
}
Exception Handling Patterns
Basic Try-Catch Pattern
Handle exceptions at appropriate boundaries with proper logging and recovery.
public class DataProcessorModule : ModuleBase
{
protected override async Task OnStart()
{
try
{
// Initialize resources
await InitializeResourcesAsync();
// Start processing
await StartProcessingAsync();
}
catch (ConfigurationException ex)
{
// Handle configuration errors specifically
Logger.LogError(ex, "Configuration error: {Key}", ex.ConfigurationKey);
// Try default configuration
await UseDefaultConfigurationAsync();
}
catch (SecurityException ex)
{
// Security errors are critical - don't continue
Logger.LogCritical(ex, "Security violation: {Context}", ex.SecurityContext);
throw; // Re-throw to prevent module start
}
catch (Exception ex)
{
// Catch-all for unexpected errors
Logger.LogError(ex, "Unexpected error during module start");
// Wrap in module exception with context
throw new ModuleException("Failed to start data processor", ex)
.WithContext("ModuleId", Id)
.WithContext("StartTime", DateTime.UtcNow);
}
}
private async Task ProcessMessage(Message message)
{
var retryCount = 0;
const int maxRetries = 3;
while (retryCount < maxRetries)
{
try
{
await ProcessDataAsync(message.Payload);
break; // Success
}
catch (TimeoutException ex) when (retryCount < maxRetries - 1)
{
// Retry on timeout
retryCount++;
Logger.LogWarning(ex, "Timeout processing message, retry {Count}/{Max}",
retryCount, maxRetries);
await Task.Delay(TimeSpan.FromSeconds(Math.Pow(2, retryCount)));
}
catch (MessageBusException ex)
{
// Don't retry message bus errors
Logger.LogError(ex, "Message bus error, skipping message");
await message.NackAsync(requeue: false);
return;
}
}
}
}
class DataProcessorModule(ModuleBase):
async def on_start(self):
try:
# Initialize resources
await self.initialize_resources()
# Start processing
await self.start_processing()
except ConfigurationException as ex:
# Handle configuration errors specifically
self.logger.error(f"Configuration error: {ex.configuration_key}", exc_info=ex)
# Try default configuration
await self.use_default_configuration()
except SecurityException as ex:
# Security errors are critical - don't continue
self.logger.critical(f"Security violation: {ex.security_context}", exc_info=ex)
raise # Re-throw to prevent module start
except Exception as ex:
# Catch-all for unexpected errors
self.logger.error("Unexpected error during module start", exc_info=ex)
# Wrap in module exception with context
raise ModuleException("Failed to start data processor") \
    .with_context("module_id", self.id) \
    .with_context("start_time", datetime.utcnow()) from ex
async def process_message(self, message: Message):
retry_count = 0
max_retries = 3
while retry_count < max_retries:
try:
await self.process_data(message.payload)
break # Success
except TimeoutException as ex:
if retry_count < max_retries - 1:
# Retry on timeout
retry_count += 1
self.logger.warning(
f"Timeout processing message, retry {retry_count}/{max_retries}",
exc_info=ex
)
await asyncio.sleep(2 ** retry_count)
else:
raise
except MessageBusException as ex:
# Don't retry message bus errors
self.logger.error("Message bus error, skipping message", exc_info=ex)
await message.nack(requeue=False)
return
class DataProcessorModule : public ModuleBase {
protected:
void OnStart() override {
try {
// Initialize resources
InitializeResources();
// Start processing
StartProcessing();
}
catch (const ConfigurationException& ex) {
// Handle configuration errors specifically
logger_->error("Configuration error: {}", ex.configuration_key());
// Try default configuration
UseDefaultConfiguration();
}
catch (const SecurityException& ex) {
// Security errors are critical - don't continue
logger_->critical("Security violation: {}", ex.security_context());
throw; // Re-throw to prevent module start
}
catch (const std::exception& ex) {
// Catch-all for unexpected errors
logger_->error("Unexpected error during module start: {}", ex.what());
// Wrap in module exception with context
throw ModuleException("Failed to start data processor")
.with_context("module_id", GetId())
.with_context("start_time", std::chrono::system_clock::now());
}
}
async_task<void> ProcessMessage(std::shared_ptr<Message> message) {
int retry_count = 0;
const int max_retries = 3;
while (retry_count < max_retries) {
try {
co_await ProcessDataAsync(message->payload());
break; // Success
}
catch (const TimeoutException& ex) {
if (retry_count < max_retries - 1) {
// Retry on timeout
retry_count++;
logger_->warn("Timeout processing message, retry {}/{}",
retry_count, max_retries);
co_await async_sleep(std::chrono::seconds(1 << retry_count));
} else {
throw;
}
}
catch (const MessageBusException& ex) {
// Don't retry message bus errors
logger_->error("Message bus error, skipping message: {}", ex.what());
co_await message->nack_async(false);
co_return;
}
}
}
};
classdef DataProcessorModule < nexus.Module
methods (Access = protected)
function onStart(obj)
try
% Initialize resources
obj.initializeResources();
% Start processing
obj.startProcessing();
catch ME
if strcmp(ME.identifier, 'Nexus:ConfigurationException')
% Handle configuration errors specifically
obj.Logger.error('Configuration error', ME);
% Try default configuration
obj.useDefaultConfiguration();
elseif strcmp(ME.identifier, 'Nexus:SecurityException')
% Security errors are critical - don't continue
obj.Logger.critical('Security violation', ME);
rethrow(ME); % Re-throw to prevent module start
else
% Catch-all for unexpected errors
obj.Logger.error('Unexpected error during module start', ME);
% Wrap in module exception with context
moduleError = MException('Nexus:ModuleException', ...
'Failed to start data processor');
moduleError = addCause(moduleError, ME);
throw(moduleError);
end
end
end
function processMessage(obj, message)
retryCount = 0;
maxRetries = 3;
while retryCount < maxRetries
try
obj.processData(message.payload);
break; % Success
catch ME
if strcmp(ME.identifier, 'Nexus:TimeoutException') && ...
retryCount < maxRetries - 1
% Retry on timeout
retryCount = retryCount + 1;
obj.Logger.warning(sprintf(...
'Timeout processing message, retry %d/%d', ...
retryCount, maxRetries));
pause(2^retryCount);
elseif strcmp(ME.identifier, 'Nexus:MessageBusException')
% Don't retry message bus errors
obj.Logger.error('Message bus error, skipping message', ME);
message.nack(false);
return;
else
rethrow(ME);
end
end
end
end
end
end
Global Exception Handler
Implement module-wide exception handling for unhandled errors.
public abstract class ResilientModuleBase : ModuleBase
{
protected ResilientModuleBase()
{
// Handle unhandled exceptions in async contexts
TaskScheduler.UnobservedTaskException += OnUnobservedTaskException;
AppDomain.CurrentDomain.UnhandledException += OnUnhandledException;
}
private void OnUnobservedTaskException(object sender, UnobservedTaskExceptionEventArgs e)
{
var exception = e.Exception.Flatten();
foreach (var ex in exception.InnerExceptions)
{
Logger.LogError(ex, "Unobserved task exception");
// Handle specific exceptions
if (ex is SecurityException)
{
// Security exceptions require immediate action
_ = Task.Run(async () => await HandleSecurityBreachAsync(ex));
}
}
// Mark as observed to prevent process termination
e.SetObserved();
// Record metric
Metrics.Counter("unhandled_exceptions", 1,
new[] { "type", exception.GetType().Name });
}
private void OnUnhandledException(object sender, UnhandledExceptionEventArgs e)
{
if (e.ExceptionObject is Exception ex)
{
Logger.LogCritical(ex, "Unhandled exception, terminating: {IsTerminating}",
e.IsTerminating);
// Last chance to save state
try
{
SaveEmergencyStateAsync().Wait(TimeSpan.FromSeconds(5));
}
catch
{
// Best effort - don't throw in exception handler
}
}
}
protected virtual async Task SaveEmergencyStateAsync()
{
// Override to save critical state before termination
await Task.CompletedTask;
}
}
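In Python, the closest equivalent is the event loop's global exception handler, which catches exceptions from tasks whose results are never awaited. A minimal sketch; the emergency-state hook is a placeholder:
# Minimal sketch: module-wide handler for unobserved asyncio exceptions.
import asyncio
import logging

def install_global_handler(logger: logging.Logger):
    loop = asyncio.get_event_loop()

    def handle_exception(loop, context):
        ex = context.get("exception")
        logger.error("Unhandled task exception: %s", context["message"],
                     exc_info=ex)
        # Placeholder: save critical state here before the task is torn down

    loop.set_exception_handler(handle_exception)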
Exception Aggregation Pattern
Handle multiple exceptions when processing batches or parallel operations.
public async Task ProcessBatchWithErrorHandling(List<WorkItem> items) // WorkItem is an illustrative item type
{
var exceptions = new ConcurrentBag<Exception>();
var successCount = 0;
var failureCount = 0;
// Process items in parallel with error handling
await Parallel.ForEachAsync(items, new ParallelOptions
{
MaxDegreeOfParallelism = 10,
CancellationToken = CancellationToken
},
async (item, ct) =>
{
try
{
await ProcessItemAsync(item);
Interlocked.Increment(ref successCount);
}
catch (Exception ex)
{
Interlocked.Increment(ref failureCount);
// Collect exceptions for analysis
exceptions.Add(new ProcessingException($"Failed to process item {item.Id}", ex)
.WithContext("ItemId", item.Id)
.WithContext("ItemType", item.Type));
// Log individual failure
Logger.LogError(ex, "Failed to process item {ItemId}", item.Id);
}
});
// Analyze results
if (exceptions.Any())
{
// Group exceptions by type
var exceptionGroups = exceptions
.GroupBy(e => e.GetType().Name)
.Select(g => new { Type = g.Key, Count = g.Count() });
foreach (var group in exceptionGroups)
{
Logger.LogWarning("Exception type {Type} occurred {Count} times",
group.Type, group.Count);
}
// Decide on action based on failure rate
var failureRate = (double)failureCount / items.Count;
if (failureRate > 0.5)
{
// More than 50% failed - this is critical
throw new AggregateException(
$"Batch processing failed: {failureCount}/{items.Count} items failed",
exceptions);
}
else if (failureRate > 0.1)
{
// More than 10% failed - warning but continue
Logger.LogWarning("Batch processing completed with {Rate:P} failure rate",
failureRate);
}
}
// Record metrics
Metrics.Counter("batch.processed", successCount, new[] { "status", "success" });
Metrics.Counter("batch.processed", failureCount, new[] { "status", "failure" });
}
Circuit Breaker Exception Pattern
Prevent cascading failures by breaking the circuit on repeated exceptions.
class CircuitBreakerMixin:
def __init__(self):
self._failure_count = 0
self._last_failure_time = None
self._circuit_state = "closed" # closed, open, half-open
self._failure_threshold = 5
self._timeout_duration = timedelta(seconds=60)
self._success_threshold = 3
self._consecutive_successes = 0
async def with_circuit_breaker(self, operation, *args, **kwargs):
"""Execute operation with circuit breaker protection"""
# Check circuit state
if self._circuit_state == "open":
if datetime.utcnow() - self._last_failure_time < self._timeout_duration:
raise CircuitOpenException(
"Circuit breaker is open, operation blocked"
).with_context("last_failure", self._last_failure_time) \
.with_context("timeout_duration", self._timeout_duration)
else:
# Try half-open state
self._circuit_state = "half-open"
self.logger.info("Circuit breaker entering half-open state")
try:
# Execute operation
result = await operation(*args, **kwargs)
# Handle success
if self._circuit_state == "half-open":
self._consecutive_successes += 1
if self._consecutive_successes >= self._success_threshold:
self._circuit_state = "closed"
self._failure_count = 0
self._consecutive_successes = 0
self.logger.info("Circuit breaker closed after successful recovery")
return result
except Exception as ex:
# Handle failure
self._failure_count += 1
self._last_failure_time = datetime.utcnow()
self._consecutive_successes = 0
if self._failure_count >= self._failure_threshold:
self._circuit_state = "open"
self.logger.error(
f"Circuit breaker opened after {self._failure_count} failures"
)
# Emit alert
await self.message_bus.publish(
CircuitBreakerOpenedEvent(
module_id=self.id,
operation_name=operation.__name__,
failure_count=self._failure_count,
last_exception=str(ex)
)
)
# Add circuit breaker context to exception
if hasattr(ex, 'with_context'):
ex.with_context("circuit_state", self._circuit_state) \
.with_context("failure_count", self._failure_count)
raise
# Usage
class DataService(ModuleBase, CircuitBreakerMixin):
def __init__(self):
super().__init__()
CircuitBreakerMixin.__init__(self)
async def fetch_data(self, url: str):
return await self.with_circuit_breaker(
self._actual_fetch, url
)
async def _actual_fetch(self, url: str):
# Actual implementation that might fail
async with aiohttp.ClientSession() as session:
async with session.get(url, timeout=30) as response:
return await response.json()
Custom Exception Types
Creating Domain-Specific Exceptions
Define custom exceptions for your module's specific error conditions.
// Domain-specific exceptions for a trading module
public class TradingException : ModuleException
{
public TradingException(string message, Exception? innerException = null)
: base(message, innerException, "TRADING_ERROR", ErrorSeverity.Error)
{
}
}
public class InsufficientBalanceException : TradingException
{
public decimal RequiredBalance { get; }
public decimal AvailableBalance { get; }
public string Currency { get; }
public InsufficientBalanceException(
decimal required,
decimal available,
string currency)
: base($"Insufficient balance: required {required} {currency}, available {available} {currency}")
{
RequiredBalance = required;
AvailableBalance = available;
Currency = currency;
WithContext("RequiredBalance", required)
.WithContext("AvailableBalance", available)
.WithContext("Currency", currency)
.WithContext("Shortfall", required - available);
}
}
public class MarketClosedException : TradingException
{
public string Market { get; }
public TimeSpan? OpensIn { get; }
public MarketClosedException(string market, DateTime? nextOpenTime = null)
: base($"Market {market} is closed")
{
Market = market;
if (nextOpenTime.HasValue)
{
OpensIn = nextOpenTime.Value - DateTime.UtcNow;
WithContext("OpensIn", OpensIn);
WithContext("NextOpenTime", nextOpenTime.Value);
}
WithContext("Market", market);
}
}
// Usage
public async Task PlaceOrder(Order order)
{
// Check market hours
if (!IsMarketOpen(order.Symbol))
{
var nextOpen = GetNextMarketOpen(order.Symbol);
throw new MarketClosedException(order.Symbol, nextOpen);
}
// Check balance
var balance = await GetBalance(order.Currency);
var required = order.Quantity * order.Price;
if (balance < required)
{
throw new InsufficientBalanceException(required, balance, order.Currency);
}
// Place order...
}
Exception Best Practices
Exception Handling Guidelines
- Be Specific: Catch specific exceptions before general ones
- Add Context: Use WithContext() to add debugging information
- Log Appropriately: Match log level to exception severity
- Don't Swallow: Never catch and ignore exceptions silently
- Fail Fast: Let critical exceptions bubble up
- Clean Resources: Use finally blocks or using statements
- Avoid Exception Flow: Don't use exceptions for control flow
- Document Exceptions: Document what exceptions methods can throw
- Test Error Paths: Unit test exception scenarios
- Monitor Exceptions: Track exception metrics and patterns
⚠️ Common Pitfalls
- Catching Exception or BaseException too broadly
- Throwing exceptions in destructors or Dispose methods
- Not preserving stack traces when re-throwing (illustrated in the sketch below)
- Creating exceptions is expensive - avoid in hot paths
- Exceptions from async void methods can't be caught - use async Task instead
- Don't throw from within catch blocks without wrapping
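On the stack-trace pitfall, Python's raise ... from ... chains the original exception so both tracebacks survive in logs, while a bare raise re-raises the active exception unchanged. A minimal sketch:
# Minimal sketch: re-throwing without losing the original traceback.
def load_config(path: str) -> str:
    try:
        with open(path) as f:
            return f.read()
    except OSError as ex:
        # 'from ex' preserves the original exception as __cause__;
        # inside an except block, a bare 'raise' would also keep it intact.
        raise RuntimeError(f"Failed to load config from {path}") from ex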
Audit Logging
Audit logging is essential for compliance, security monitoring, and forensic analysis in industrial systems. This section covers how to implement audit trails in your NEXUS-1 modules.
High-Level Audit API
Simplified Audit Logging
The NEXUS-1 SDK includes a high-level audit API that dramatically simplifies audit logging with fluent methods, automatic context capture, and built-in compliance support:
// C# - High-Level Audit API
public class ControlModule : ModuleBase
{
private IAudit Audit => Context.Audit;
public override async Task InitializeAsync(IModuleContext context)
{
await base.InitializeAsync(context);
// Log module initialization
await Audit.SystemEvent("ModuleInitialized", "Control module started")
.ByUser("system")
.WithResult(AuditResult.Success)
.RecordAsync();
}
// Configuration change
public async Task UpdateConfiguration(string key, object oldValue, object newValue, string userId)
{
await Audit.ConfigurationChanged(key, oldValue, newValue)
.ByUser(userId)
.ForCompliance(ComplianceStandard.ISO27001, "A.12.1.2")
.RecordAsync();
}
// Control command
public async Task ExecuteCommand(string deviceId, string command, object parameters, string userId)
{
var builder = Audit.ControlCommand(deviceId, command, parameters)
.ByUser(userId)
.WithSeverity(AuditSeverity.High);
try
{
// Execute the command
await PerformCommand(deviceId, command, parameters);
await builder.WithResult(AuditResult.Success).RecordAsync();
}
catch (Exception ex)
{
await builder.WithResult(AuditResult.Failure, ex.Message)
.RequiresReview("Command execution failed")
.RecordAsync();
throw;
}
}
// Data access
public async Task ReadSensorData(string sensorId, string userId)
{
await Audit.DataAccessed("SensorReading", sensorId)
.ByUser(userId)
.WithProperty("purpose", "monitoring")
.RecordAsync();
return await GetSensorData(sensorId);
}
// Security event
public async Task HandleFailedLogin(string username, string ipAddress)
{
await Audit.SecurityEvent("LoginFailed", $"Failed login attempt for {username}")
.WithProperty("username", username)
.WithProperty("ipAddress", ipAddress)
.WithTags("authentication", "failed")
.RecordAsync();
}
}
# Python - High-Level Audit API
from nexus_sdk import Module, AuditResult, AuditSeverity, ComplianceStandard
class ControlModule(Module):
@property
def audit(self):
return self.context.audit
async def initialize(self):
await super().initialize()
# Log module initialization
await self.audit.system_event("ModuleInitialized", "Control module started") \
.by_user("system") \
.with_result(AuditResult.SUCCESS) \
.record_async()
# Configuration change
async def update_configuration(self, key: str, old_value, new_value, user_id: str):
await self.audit.configuration_changed(key, old_value, new_value) \
.by_user(user_id) \
.for_compliance(ComplianceStandard.ISO27001, "A.12.1.2") \
.record_async()
# Control command
async def execute_command(self, device_id: str, command: str, parameters: dict, user_id: str):
builder = self.audit.control_command(device_id, command, parameters) \
.by_user(user_id) \
.with_severity(AuditSeverity.HIGH)
try:
# Execute the command
await self.perform_command(device_id, command, parameters)
await builder.with_result(AuditResult.SUCCESS).record_async()
except Exception as ex:
await builder.with_result(AuditResult.FAILURE, str(ex)) \
.requires_review("Command execution failed") \
.record_async()
raise
# Data access
async def read_sensor_data(self, sensor_id: str, user_id: str):
await self.audit.data_accessed("SensorReading", sensor_id) \
.by_user(user_id) \
.with_property("purpose", "monitoring") \
.record_async()
return await self.get_sensor_data(sensor_id)
# Security event
async def handle_failed_login(self, username: str, ip_address: str):
await self.audit.security_event("LoginFailed", f"Failed login attempt for {username}") \
.with_property("username", username) \
.with_property("ip_address", ip_address) \
.with_tags("authentication", "failed") \
.record_async()
// C++ - High-Level Audit API
#include <nexus_sdk/audit.hpp>
class ControlModule : public ModuleBase {
private:
std::shared_ptr<IAudit> audit() { return context()->audit(); }
public:
async_task<void> initialize_async(std::shared_ptr<ModuleContext> ctx) override {
co_await ModuleBase::initialize_async(ctx);
// Log module initialization
co_await audit()->system_event("ModuleInitialized", "Control module started")
->by_user("system")
->with_result(AuditResult::Success)
->record_async();
}
// Configuration change
async_task<void> update_configuration(
const std::string& key,
const std::any& old_value,
const std::any& new_value,
const std::string& user_id) {
co_await audit()->configuration_changed(key, old_value, new_value)
->by_user(user_id)
->for_compliance(ComplianceStandard::ISO27001, "A.12.1.2")
->record_async();
}
// Control command
async_task<void> execute_command(
const std::string& device_id,
const std::string& command,
const std::any& parameters,
const std::string& user_id) {
auto builder = audit()->control_command(device_id, command, parameters)
->by_user(user_id)
->with_severity(AuditSeverity::High);
try {
// Execute the command
co_await perform_command(device_id, command, parameters);
co_await builder->with_result(AuditResult::Success)->record_async();
} catch (const std::exception& ex) {
co_await builder->with_result(AuditResult::Failure, ex.what())
->requires_review("Command execution failed")
->record_async();
throw;
}
}
// Data access
async_task<SensorData> read_sensor_data(
const std::string& sensor_id,
const std::string& user_id) {
co_await audit()->data_accessed("SensorReading", sensor_id)
->by_user(user_id)
->with_property("purpose", "monitoring")
->record_async();
co_return co_await get_sensor_data(sensor_id);
}
// Security event
async_task<void> handle_failed_login(
const std::string& username,
const std::string& ip_address) {
co_await audit()->security_event(
"LoginFailed",
"Failed login attempt for " + username)
->with_property("username", username)
->with_property("ip_address", ip_address)
->with_tags({"authentication", "failed"})
->record_async();
}
};
% MATLAB - High-Level Audit API
classdef ControlModule < nexus.Module
methods
function onInitialize(obj)
% Log module initialization
obj.audit().systemEvent("ModuleInitialized", "Control module started") ...
.byUser("system") ...
.withResult(nexus.AuditResult.Success) ...
.record();
end
function updateConfiguration(obj, key, oldValue, newValue, userId)
% Configuration change
obj.audit().configurationChanged(key, oldValue, newValue) ...
.byUser(userId) ...
.forCompliance(nexus.ComplianceStandard.ISO27001, "A.12.1.2") ...
.record();
end
function executeCommand(obj, deviceId, command, parameters, userId)
% Control command
builder = obj.audit().controlCommand(deviceId, command, parameters) ...
.byUser(userId) ...
.withSeverity(nexus.AuditSeverity.High);
try
% Execute the command
obj.performCommand(deviceId, command, parameters);
builder.withResult(nexus.AuditResult.Success).record();
catch ex
builder.withResult(nexus.AuditResult.Failure, ex.message) ...
.requiresReview("Command execution failed") ...
.record();
rethrow(ex);
end
end
function data = readSensorData(obj, sensorId, userId)
% Data access
obj.audit().dataAccessed("SensorReading", sensorId) ...
.byUser(userId) ...
.withProperty("purpose", "monitoring") ...
.record();
data = obj.getSensorData(sensorId);
end
function handleFailedLogin(obj, username, ipAddress)
% Security event
obj.audit().securityEvent("LoginFailed", ...
sprintf("Failed login attempt for %s", username)) ...
.withProperty("username", username) ...
.withProperty("ipAddress", ipAddress) ...
.withTags("authentication", "failed") ...
.record();
end
end
end
// LabVIEW - High-Level Audit API
// Note: LabVIEW uses graphical programming. Below are the VI descriptions:
// Initialize.vi Override
1. Get Audit Logger from Context
2. System Event.vi
- Event Type: "ModuleInitialized"
- Details: "Control module started"
3. By User.vi
- User ID: "system"
4. With Result.vi
- Result: Success
5. Record.vi
// Update Configuration.vi
Inputs: Key (String), Old Value (Variant), New Value (Variant), User ID (String)
1. Configuration Changed.vi
- Config Key: Key input
- Old Value: Old Value input
- New Value: New Value input
2. By User.vi
- User ID: User ID input
3. For Compliance.vi
- Standard: ISO27001
- Requirement: "A.12.1.2"
4. Record.vi
// Execute Command.vi
Inputs: Device ID, Command, Parameters, User ID
1. Control Command.vi
- Device ID: Device ID input
- Command: Command input
- Parameters: Parameters input
2. By User.vi
- User ID: User ID input
3. With Severity.vi
- Severity: High
4. Try-Catch Structure:
- Try: Execute command, then With Result.vi (Success) → Record.vi
- Catch: With Result.vi (Failure, error message) →
Requires Review.vi ("Command execution failed") → Record.vi
// Read Sensor Data.vi
Inputs: Sensor ID, User ID
1. Data Accessed.vi
- Resource Type: "SensorReading"
- Resource ID: Sensor ID input
2. By User.vi
- User ID: User ID input
3. With Property.vi
- Key: "purpose"
- Value: "monitoring"
4. Record.vi
5. Get Sensor Data (return value)
// Handle Failed Login.vi
Inputs: Username, IP Address
1. Security Event.vi
- Event Type: "LoginFailed"
- Details: Format into string "Failed login attempt for [username]"
2. With Property.vi
- Key: "username"
- Value: Username input
3. With Property.vi
- Key: "ipAddress"
- Value: IP Address input
4. With Tags.vi
- Tags: ["authentication", "failed"]
5. Record.vi
Available Audit Event Types
Pre-defined Audit Methods
The SDK provides specialized methods for common audit scenarios:
- ConfigurationChanged: Track changes to module or system configuration
- DataAccessed: Record when sensitive data is read
- DataModified: Log data changes with before/after values
- DataDeleted: Track data deletion operations
- ControlCommand: Audit control system commands and parameters
- SecurityEvent: Log security-relevant events (login attempts, access denied, etc.)
- SystemEvent: Record system-level events (startup, shutdown, errors)
- Custom: Create custom audit events for specific needs
Fluent Builder Methods
Enhance Audit Events with Context
Chain these methods to add important context to your audit events:
User Context
- ByUser(userId, userName?) - Sets who performed the action
Result and Severity
- WithResult(result, reason?) - Success, Failure, PartialSuccess, Denied, Error
- WithSeverity(severity) - Low, Medium, High, Critical
Additional Properties
- WithProperty(key, value) - Add custom key-value pairs
- WithProperties(dictionary) - Add multiple properties at once
- WithTags(...tags) - Add categorization tags
Compliance and Review
- ForCompliance(standard, requirement?) - Link to compliance requirement
- RequiresReview(reason) - Flag for manual review
- WithEvidence(type, data) - Attach supporting evidence
Recording
- RecordAsync() - Asynchronously record the event
- Record() - Synchronously record the event
Configuration
Audit Configuration in nexus-manifest.yaml
audit:
enabled: true
includeDetailedContext: true
encryption:
enabled: true
algorithm: "AES-256-GCM"
signing:
enabled: true
algorithm: "RSA-SHA256"
retentionDays: 2555 # 7 years
compliance:
- "IEC62443"
- "ISO27001"
- "NIST80053"
storage:
- type: "local"
path: "/var/log/nexus/audit"
maxSizeMB: 10000
- type: "remote"
endpoint: "https://audit.company.com/api/v1/logs"
apiKey: "${AUDIT_API_KEY}"
Audit Logging Requirements
What to Audit
Critical Events to Log
- Authentication & Authorization: Login attempts, permission changes, access denials
- Configuration Changes: Module settings, system parameters, operational modes
- Data Access: Read/write operations on sensitive data
- Control Actions: Commands sent to devices, setpoint changes
- System Events: Module lifecycle, errors, warnings
- Security Events: Failed validations, potential attacks, anomalies
Benefits of High-Level API
- Minimal Code: 80% less code compared to manual implementation
- Automatic Context: User ID, timestamp, module ID, session ID captured automatically
- Type Safety: Strongly-typed enums for results, severity, and compliance standards
- Built-in Security: Automatic encryption, signing, and tamper detection
- Compliance Ready: Pre-configured templates for common standards
- Performance: Asynchronous processing, batching, and minimal overhead
- Error Resilience: Audit failures don't crash your module
- Query Support: Structured data enables powerful analysis
Common Audit Patterns
Pattern 1: Batch Operations
When performing multiple related operations, link them with a batch ID:
// C# Example
var batchId = Guid.NewGuid().ToString();
foreach (var item in items)
{
await Audit.DataModified("Configuration", item.Key, item.OldValue, item.NewValue)
.ByUser(userId)
.WithProperty("batchId", batchId)
.WithProperty("itemIndex", items.IndexOf(item))
.RecordAsync();
}
// Record batch completion
await Audit.Custom("BatchComplete")
.ByUser(userId)
.WithProperty("batchId", batchId)
.WithProperty("itemCount", items.Count)
.WithResult(AuditResult.Success)
.RecordAsync();
Pattern 2: Error Handling Integration
Automatically audit errors and exceptions:
// Python Example
try:
result = await perform_critical_operation()
await self.audit.custom("CriticalOperation") \
.by_user(user_id) \
.with_property("operation", "UpdateFirmware") \
.with_result(AuditResult.SUCCESS) \
.record_async()
except Exception as e:
# Audit the failure with high severity
await self.audit.custom("CriticalOperationFailed") \
.by_user(user_id) \
.with_property("operation", "UpdateFirmware") \
.with_property("error", str(e)) \
.with_property("stackTrace", traceback.format_exc()) \
.with_result(AuditResult.FAILURE, str(e)) \
.with_severity(AuditSeverity.CRITICAL) \
.requires_review("Critical operation failed") \
.record_async()
raise
Pattern 3: Compliance-Driven Auditing
Ensure all required events are captured for compliance:
// C++ Example
class ComplianceAwareModule : public ModuleBase {
private:
// Helper for IEC 62443 compliance
template <typename Func>
auto with_iec62443_audit(const std::string& action,
const std::string& user_id,
Func&& operation) {
auto start_time = std::chrono::steady_clock::now();
try {
auto result = operation();
// Success audit with timing
auto duration = std::chrono::steady_clock::now() - start_time;
audit()->custom(action)
->by_user(user_id)
->with_property("duration_ms",
std::chrono::duration_cast<std::chrono::milliseconds>(duration).count())
->with_result(AuditResult::Success)
->for_compliance(ComplianceStandard::IEC62443, "SR 2.8")
->record();
return result;
} catch (const std::exception& e) {
// Failure audit
audit()->custom(action)
->by_user(user_id)
->with_result(AuditResult::Failure, e.what())
->with_severity(AuditSeverity::High)
->for_compliance(ComplianceStandard::IEC62443, "SR 2.8")
->requires_review("Operation failed")
->record();
throw;
}
}
};
Pattern 4: Contextual Enrichment
Add rich context to audit events for better analysis:
% MATLAB Example
function performSensitiveOperation(obj, operation, userId)
% Capture context before operation
context = struct();
context.systemLoad = obj.getSystemLoad();
context.activeUsers = obj.getActiveUserCount();
context.moduleVersion = obj.Version;
context.environment = obj.config().get('environment', 'production');
% Create audit builder with context
builder = obj.audit().custom(operation) ...
.byUser(userId) ...
.withProperties(context) ...
.withTags("sensitive", "monitored");
try
% Perform operation
result = obj.executeOperation(operation);
% Add result context
builder.withProperty("resultSize", numel(result)) ...
.withResult(nexus.AuditResult.Success) ...
.record();
catch ex
% Add error context
builder.withProperty("errorType", class(ex)) ...
.withProperty("errorLocation", ex.stack(1).name) ...
.withResult(nexus.AuditResult.Failure, ex.message) ...
.withSeverity(nexus.AuditSeverity.High) ...
.record();
rethrow(ex);
end
end
Audit Log Analysis and Reporting
Query Examples
-- Find all failed login attempts in the last 24 hours
SELECT * FROM audit_logs
WHERE event_type = 'AUTHENTICATION'
AND result = 'FAILURE'
AND timestamp > NOW() - INTERVAL '24 hours'
ORDER BY timestamp DESC;
-- Identify configuration changes by user
SELECT
user_id,
COUNT(*) as change_count,
ARRAY_AGG(DISTINCT resource_type) as affected_resources
FROM audit_logs
WHERE event_type = 'CONFIGURATION_CHANGE'
AND timestamp > NOW() - INTERVAL '7 days'
GROUP BY user_id
ORDER BY change_count DESC;
-- Detect anomalous access patterns
-- (a window alias cannot be referenced in the same query's WHERE clause,
--  so the baseline is computed in a second CTE)
WITH user_access AS (
    SELECT
        user_id,
        DATE_TRUNC('hour', timestamp) AS hour,
        COUNT(*) AS access_count
    FROM audit_logs
    WHERE event_type IN ('DATA_ACCESS', 'CONTROL_ACTION')
    GROUP BY user_id, hour
),
access_with_baseline AS (
    SELECT
        user_id,
        hour,
        access_count,
        AVG(access_count) OVER (
            PARTITION BY user_id
            ORDER BY hour
            ROWS BETWEEN 168 PRECEDING AND 1 PRECEDING
        ) AS avg_hourly_access
    FROM user_access
)
SELECT user_id, hour, access_count, avg_hourly_access
FROM access_with_baseline
WHERE access_count > avg_hourly_access * 3; -- 3x the trailing weekly average
Compliance and Standards
Industry Standards for Audit Logging
- IEC 62443: Security for industrial automation and control systems
- ISO 27001: Information security management
- NIST 800-53: Security and privacy controls
- FDA 21 CFR Part 11: Electronic records for pharmaceutical
- NERC CIP: Critical infrastructure protection
Common Requirements
- User identification and authentication events
- System configuration changes
- Data creation, modification, and deletion
- Security policy changes
- System errors and failures
- Backup and restore operations
Best Practices
Audit Logging Guidelines
- Log Consistently: Use structured logging with consistent field names
- Include Context: Capture who, what, when, where, why, and how
- Protect Logs: Encrypt sensitive data and ensure tamper-resistance
- Monitor Performance: Ensure logging doesn't impact system performance
- Test Recovery: Regularly verify you can retrieve and analyze logs
- Automate Analysis: Set up alerts for suspicious patterns
- Document Retention: Clear policies on what to log and for how long
- Time Synchronization: Ensure all systems use synchronized clocks (NTP)
- Fail Securely: If audit logging fails, the system should fail to a secure state (see the sketch after this list)
- Regular Reviews: Periodically review audit logs for security issues
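To illustrate the fail-securely guideline, here is a minimal Python sketch that blocks a sensitive operation when its audit record cannot be written. This assumes record_async surfaces failures; if your deployment configures the SDK to swallow audit errors, a health check on the audit sink serves the same purpose:
# Minimal sketch: refuse to act when the audit trail cannot be written.
async def audited_command(self, device_id: str, command: str, user_id: str):
    try:
        await self.audit.control_command(device_id, command, {}) \
            .by_user(user_id) \
            .record_async()
    except Exception as ex:
        # Fail to a secure state: no audit record, no control action
        self.logger.critical(f"Audit write failed, blocking {command}: {ex}")
        raise
    await self.perform_command(device_id, command, {})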
Cross-Platform Development
Build distributed systems that seamlessly integrate modules written in different programming languages. The NEXUS-1 SDK's language-agnostic architecture enables true cross-platform development.
Overview
Why Cross-Platform Development?
Modern industrial systems often require combining the strengths of different languages:
- C++: High-performance real-time processing and hardware interfaces
- Python: Machine learning, data analysis, and rapid prototyping
- C#/.NET: Enterprise integration and business logic
- MATLAB: Advanced algorithms and scientific computing
- LabVIEW: Test instrumentation and legacy system integration
Architecture Patterns
Microservices Pattern
Each module runs as an independent service, communicating through the message bus.
# System Architecture Example
modules:
# High-speed data acquisition in C++
- id: data-acquisition
language: cpp
capabilities: [sensor-data, real-time]
# ML processing in Python
- id: anomaly-detection
language: python
capabilities: [machine-learning, analytics]
subscribes: [sensor-data]
# Business rules in C#
- id: alert-manager
language: csharp
capabilities: [notifications, rules-engine]
subscribes: [anomalies]
# Control algorithms in MATLAB
- id: pid-controller
language: matlab
capabilities: [control-systems]
subscribes: [sensor-data]
publishes: [control-signals]
Pipeline Pattern
Data flows through a series of processing stages, each implemented in the most suitable language.
┌─────────────┐      ┌──────────────┐      ┌─────────────┐      ┌─────────────┐
│   C++ DAQ   │ ──►  │ Python Filter│ ──►  │ MATLAB Algo │ ──►  │ C# Business │
│  (100 kHz)  │      │ (Decimation) │      │ (Analysis)  │      │    Logic    │
└─────────────┘      └──────────────┘      └─────────────┘      └─────────────┘
Hub-and-Spoke Pattern
A central coordinator manages workflows across specialized modules.
// C# Workflow Coordinator
public class WorkflowCoordinator : ModuleBase
{
protected override async Task OnInitializeAsync()
{
// Subscribe to workflow triggers
await Messages.SubscribeAsync("workflow.start", async (msg) =>
{
var workflow = msg.GetPayload<WorkflowRequest>(); // payload/result types here are illustrative
// Step 1: Acquire data (C++)
var dataRequest = await Messages.RequestAsync<AcquisitionResult>(
"data-acquisition/acquire",
new { SampleRate = 100000, Duration = 10 }
);
// Step 2: Process with ML (Python)
var mlResult = await Messages.RequestAsync<AnalysisResult>(
"ml-processor/analyze",
new { Data = dataRequest.Data, Model = "anomaly_v2" }
);
// Step 3: Apply control (MATLAB)
if (mlResult.AnomalyDetected)
{
await Messages.PublishAsync(
"control-system/adjust",
new { Parameters = mlResult.Recommendations }
);
}
});
}
}
Interoperability Guidelines
Message Format Standardization
Use consistent message formats across all languages to ensure seamless communication.
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"timestamp": {
"type": "string",
"format": "date-time"
},
"sensorId": {
"type": "string"
},
"value": {
"type": "number"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit", "kelvin"]
},
"metadata": {
"type": "object"
}
},
"required": ["timestamp", "sensorId", "value", "unit"]
}
syntax = "proto3";
package nexus.sensors;
import "google/protobuf/timestamp.proto";
message SensorReading {
google.protobuf.Timestamp timestamp = 1;
string sensor_id = 2;
double value = 3;
enum Unit {
CELSIUS = 0;
FAHRENHEIT = 1;
KELVIN = 2;
}
Unit unit = 4;
map<string, string> metadata = 5;
}
# MessagePack schema definition
SensorReading:
fields:
- name: timestamp
type: timestamp
id: 0
- name: sensor_id
type: str
id: 1
- name: value
type: float64
id: 2
- name: unit
type: enum[celsius, fahrenheit, kelvin]
id: 3
- name: metadata
type: map
id: 4
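One way to enforce the shared format at every module boundary is to validate inbound payloads before processing. A minimal Python sketch using the jsonschema package; the schema dict mirrors the JSON Schema definition above:
# Minimal sketch: validate inbound payloads against the shared schema.
from typing import Optional
from jsonschema import ValidationError, validate

SENSOR_SCHEMA = {
    "type": "object",
    "properties": {
        "timestamp": {"type": "string", "format": "date-time"},
        "sensorId": {"type": "string"},
        "value": {"type": "number"},
        "unit": {"type": "string",
                 "enum": ["celsius", "fahrenheit", "kelvin"]},
        "metadata": {"type": "object"},
    },
    "required": ["timestamp", "sensorId", "value", "unit"],
}

def validated_payload(payload: dict, logger) -> Optional[dict]:
    try:
        validate(instance=payload, schema=SENSOR_SCHEMA)
        return payload
    except ValidationError as e:
        logger.warning(f"Rejected malformed sensor message: {e.message}")
        return None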
Type Mapping Between Languages
Understand how data types map between different languages to avoid conversion issues.
Data Type | C#/.NET | Python | C++ | MATLAB | LabVIEW |
---|---|---|---|---|---|
Integer (32-bit) | int | int | int32_t | int32 | I32 |
Float (64-bit) | double | float | double | double | DBL |
String | string | str | std::string | char array | String |
Boolean | bool | bool | bool | logical | Boolean |
Array | T[] | list | std::vector<T> | array | Array |
Dictionary | Dictionary<K,V> | dict | std::map<K,V> | containers.Map | Variant |
Timestamp | DateTime | datetime | std::chrono::time_point | datetime | Timestamp |
Handling Endianness and Byte Order
Ensure consistent byte ordering when sharing binary data between platforms.
// C++ - Sending binary data
void sendSensorData(const std::vector<float>& data) {
    // Convert to network byte order (big-endian)
    std::vector<uint32_t> networkData;
for (float value : data) {
uint32_t bits;
std::memcpy(&bits, &value, sizeof(float));
networkData.push_back(htonl(bits));
}
nexus::Message msg;
msg.set_topic("sensor.data.raw");
msg.set_content_type("application/octet-stream");
msg.set_body(networkData.data(), networkData.size() * sizeof(uint32_t));
message_bus_->publish(msg);
}
# Python - Receiving binary data
import struct
def on_sensor_data(message):
if message.content_type == "application/octet-stream":
# Unpack from network byte order
data = struct.unpack(f'>{len(message.body)//4}f', message.body)
process_sensor_data(data)
Development Workflow
Multi-Language Project Structure
Organize your cross-platform project for maximum maintainability.
nexus-project/
├── modules/
│ ├── cpp/
│ │ ├── data-acquisition/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── src/
│ │ │ └── tests/
│ │ └── signal-processing/
│ ├── python/
│ │ ├── ml-analytics/
│ │ │ ├── requirements.txt
│ │ │ ├── src/
│ │ │ └── tests/
│ │ └── data-pipeline/
│ ├── csharp/
│ │ ├── BusinessLogic/
│ │ │ ├── BusinessLogic.csproj
│ │ │ └── src/
│ │ └── WebAPI/
│ ├── matlab/
│ │ └── control-algorithms/
│ └── labview/
│ └── test-automation/
├── shared/
│ ├── schemas/ # Shared message schemas
│ ├── contracts/ # Interface definitions
│ └── documentation/
├── integration-tests/ # Cross-module tests
├── docker-compose.yml # Multi-container setup
└── nexus-manifest.yaml # System configuration
Shared Development Practices
1. Version Control Strategy
- Use a monorepo for tightly coupled modules
- Separate repos with git submodules for loosely coupled systems
- Tag releases across all languages simultaneously
2. Continuous Integration
# .github/workflows/cross-platform-ci.yml
name: Cross-Platform CI
on: [push, pull_request]
jobs:
cpp-modules:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Build C++ modules
run: |
cd modules/cpp
cmake -B build
cmake --build build
ctest --test-dir build
python-modules:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.9'
- name: Test Python modules
run: |
cd modules/python
pip install -r requirements.txt
pytest
csharp-modules:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-dotnet@v3
with:
dotnet-version: '8.0.x'
- name: Test C# modules
run: |
cd modules/csharp
dotnet test
integration-tests:
needs: [cpp-modules, python-modules, csharp-modules]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Run integration tests
run: |
docker-compose up -d
./run-integration-tests.sh
Testing Cross-Platform Systems
Integration Testing Strategies
# docker-compose.test.yml
version: '3.8'
services:
nexus-host:
image: nexus-host:latest
volumes:
- ./test-manifest.yaml:/app/nexus-manifest.yaml
ports:
- "8080:8080"
cpp-daq:
build: ./modules/cpp/data-acquisition
environment:
- NEXUS_HOST=nexus-host:8080
depends_on:
- nexus-host
python-ml:
build: ./modules/python/ml-analytics
environment:
- NEXUS_HOST=nexus-host:8080
depends_on:
- nexus-host
csharp-logic:
build: ./modules/csharp/BusinessLogic
environment:
- NEXUS_HOST=nexus-host:8080
depends_on:
- nexus-host
test-runner:
build: ./integration-tests
environment:
- NEXUS_HOST=nexus-host:8080
depends_on:
- cpp-daq
- python-ml
- csharp-logic
command: pytest -v /tests
# Python integration test framework
import pytest
import asyncio
from nexus_sdk import NexusClient
class TestCrossPlatformIntegration:
@pytest.fixture
async def nexus_client(self):
client = NexusClient("ws://localhost:8080")
await client.connect()
yield client
await client.disconnect()
@pytest.mark.asyncio
async def test_cpp_to_python_pipeline(self, nexus_client):
# Subscribe to ML results
results = []
await nexus_client.subscribe(
"ml.results.*",
lambda msg: results.append(msg)
)
# Trigger C++ data acquisition
await nexus_client.publish(
"daq.commands.start",
{"sample_rate": 1000, "duration": 1}
)
# Wait for pipeline to complete
await asyncio.sleep(2)
# Verify ML processing occurred
assert len(results) > 0
assert results[0].payload.get("model_version") == "v2.1"
assert "predictions" in results[0].payload
@pytest.mark.asyncio
async def test_full_workflow(self, nexus_client):
# Test complete cross-platform workflow
workflow_complete = asyncio.Event()
async def on_workflow_complete(msg):
workflow_complete.set()
await nexus_client.subscribe(
"workflow.complete",
on_workflow_complete
)
# Start workflow
await nexus_client.publish(
"workflow.start",
{"type": "anomaly_detection", "priority": "high"}
)
# Wait for completion with timeout
await asyncio.wait_for(
workflow_complete.wait(),
timeout=30.0
)
// Contract testing with Pact
// C# Consumer Test
[Test]
public async Task PythonMLService_ProcessesData()
{
var pact = new PactBuilder()
.Consumer("CSharpBusinessLogic")
.Provider("PythonMLService")
.Build();
pact.UponReceiving("a request to process sensor data")
.Given("ML model v2.1 is loaded")
.WithRequest("POST", "/analyze")
.WithJsonBody(new
{
data = new[] { 1.2, 3.4, 5.6 },
model = "anomaly_v2"
})
.WillRespondWith(200)
.WithJsonBody(new
{
predictions = new[] { 0.1, 0.2, 0.8 },
anomaly_detected = true,
confidence = 0.95
});
await pact.VerifyAsync(async () =>
{
var client = new HttpClient { BaseAddress = pact.MockServerUri };
var response = await client.PostAsJsonAsync("/analyze", new
{
data = new[] { 1.2, 3.4, 5.6 },
model = "anomaly_v2"
});
Assert.That(response.IsSuccessStatusCode);
});
}
# Python Provider Verification
from pact import Verifier
def test_pact_verification():
verifier = Verifier()
success = verifier.verify_pacts(
provider='PythonMLService',
provider_base_url='http://localhost:5000',
pact_urls=['./pacts/CSharpBusinessLogic-PythonMLService.json']
)
assert success == 0
Performance Optimization
Cross-Language Communication Overhead
Minimize serialization costs and optimize data transfer between modules.
1. Binary Protocols for High-Frequency Data
// C++ - Efficient binary serialization
class BinarySerializer {
public:
static std::vector<uint8_t> serialize(const SensorData& data) {
flatbuffers::FlatBufferBuilder builder(1024);
auto readings = builder.CreateVector(data.readings);
auto sensorData = CreateSensorData(
builder,
data.timestamp,
data.sensor_id,
readings
);
builder.Finish(sensorData);
return std::vector<uint8_t>(
builder.GetBufferPointer(),
builder.GetBufferPointer() + builder.GetSize()
);
}
};
2. Batch Processing
# Python - Batch processing for efficiency
class BatchProcessor:
def __init__(self, batch_size=100, timeout=1.0):
self.batch_size = batch_size
self.timeout = timeout
self.buffer = []
self.last_flush = time.time()
async def add_item(self, item):
self.buffer.append(item)
if len(self.buffer) >= self.batch_size or \
time.time() - self.last_flush > self.timeout:
await self.flush()
async def flush(self):
if self.buffer:
# Process entire batch at once
results = await self.ml_model.predict_batch(self.buffer)
await self.publish_results(results)
self.buffer.clear()
self.last_flush = time.time()
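For example, the batcher can be fed straight from a subscription. A minimal wiring sketch, assuming a connected client and that the host module supplies the ml_model and publish_results used in flush():

# Inside an async setup function; each incoming message lands in the
# batcher, and flush() turns N single-item inferences into one batched call.
processor = BatchProcessor(batch_size=100, timeout=1.0)

async def on_feature_message(msg):
    await processor.add_item(msg.payload)

await client.subscribe("features.extracted", on_feature_message)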
3. Zero-Copy Techniques
// C++ - Zero-copy shared memory for local modules
class SharedMemoryTransport {
boost::interprocess::mapped_region region;
public:
void send_large_dataset(const float* data, size_t size) {
// Write directly to shared memory
        auto* shared_data = static_cast<float*>(region.get_address());
std::memcpy(shared_data, data, size * sizeof(float));
// Send only metadata through message bus
nexus::Message msg;
msg.set_topic("data.ready");
msg.set_payload({
{"type", "shared_memory"},
{"region", "sensor_data_001"},
{"size", size},
{"offset", 0}
});
publish(msg);
}
};
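On the receiving side, only the metadata travels over the bus; the consumer maps the named region itself. A minimal Python sketch, assuming a Linux host where the boost::interprocess region is exposed as /dev/shm/<name>, with hypothetical module and handler names:

import mmap

import numpy as np
from nexus_sdk import Module

class SharedMemoryConsumer(Module):
    async def on_initialize(self):
        # The bulk data never crosses the bus; only the "data.ready" metadata does
        await self.messages.subscribe("data.ready", self.on_data_ready)

    async def on_data_ready(self, msg):
        meta = msg.payload
        if meta.get("type") != "shared_memory":
            return
        # Assumption: named boost::interprocess regions appear under /dev/shm
        with open(f"/dev/shm/{meta['region']}", "rb") as f:
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as buf:
                data = np.frombuffer(buf, dtype=np.float32,
                                     count=meta["size"], offset=meta["offset"])
                await self.process(data.copy())  # copy before the mapping closes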
Real-World Examples
Example 1: Industrial IoT Data Pipeline
Combining high-speed data acquisition with ML-based predictive maintenance.
┌──────────────┐      ┌──────────────┐      ┌──────────────┐      ┌──────────────┐
│   C++ DAQ    │      │Python Feature│      │  Python ML   │      │ C# Business  │
│              │      │ Engineering  │      │  Inference   │      │    Rules     │
│ • 100kHz ADC │ ───► │ • FFT        │ ───► │ • TensorFlow │ ───► │ • Alerts     │
│ • Filtering  │      │ • Statistics │      │ • Anomaly    │      │ • Reports    │
│ • Buffering  │      │ • Windowing  │      │   Detection  │      │ • Dashboard  │
└──────────────┘      └──────────────┘      └──────────────┘      └──────────────┘
# Configuration for industrial IoT pipeline
modules:
- id: high-speed-daq
language: cpp
config:
sample_rate: 100000 # 100 kHz
channels: [0, 1, 2, 3]
buffer_size: 10000
- id: feature-extractor
language: python
config:
window_size: 1024
overlap: 0.5
features: [fft, rms, peak, crest_factor]
subscribes:
- topic: "raw.sensor.data"
batch_size: 10
- id: anomaly-detector
language: python
config:
model_path: "/models/vibration_anomaly_v3.h5"
threshold: 0.85
subscribes:
- topic: "features.extracted"
- id: maintenance-manager
language: csharp
config:
database: "SqlServer"
notification_channels: ["email", "sms", "teams"]
subscribes:
- topic: "anomalies.detected"
Example 2: Hybrid Control System
MATLAB control algorithms with C++ real-time execution and Python optimization.
%% MATLAB - Control Algorithm Design
classdef MPCController < nexus.Module
properties
horizon = 10
constraints
model
end
methods
function obj = MPCController()
obj@nexus.Module('MPC-Controller', '1.0.0');
% Load system model
obj.model = ss(A, B, C, D);
% Define constraints
obj.constraints = struct(...
'umin', -10, ...
'umax', 10, ...
'ymin', -inf, ...
'ymax', inf ...
);
end
function onInitialize(obj)
% Export controller for C++ implementation
obj.exportController();
% Subscribe to optimization requests
obj.messages.subscribe('control.optimize', @obj.optimizeController);
end
function exportController(obj)
% Generate C++ code from MATLAB
cfg = coder.config('lib');
cfg.TargetLang = 'C++';
codegen -config cfg mpc_step -args {zeros(4,1), 1.0}
% Publish generated code location
obj.messages.publish('control.code.ready', struct(...
'path', '/generated/mpc_controller.cpp', ...
'version', '1.0.0' ...
));
end
end
end
// C++ - Real-time Controller Execution
class RealtimeController : public nexus::ModuleBase {
    std::unique_ptr<MpcStepController> controller_;  // wrapper type for the MATLAB-generated mpc_step code (name assumed)
protected:
async_task on_initialize() override {
// Wait for MATLAB-generated code
auto code_msg = co_await messages()->request(
"control.code.ready",
std::chrono::seconds(30)
);
// Load generated controller
        controller_ = std::make_unique<MpcStepController>(
            code_msg["path"].get<std::string>()
        );
// Subscribe to sensor data at high frequency
co_await messages()->subscribe(
"sensors.state",
[this](auto msg) { return execute_control(msg); }
);
}
async_task execute_control(const nexus::Message& msg) {
auto state = msg.get_payload();
// Execute control law (guaranteed < 1ms)
auto start = std::chrono::high_resolution_clock::now();
auto control_output = controller_->compute(state);
auto duration = std::chrono::high_resolution_clock::now() - start;
// Publish control signal
co_await messages()->publish("actuators.command", control_output);
// Send telemetry
co_await messages()->publish("control.telemetry", {
{"execution_time_us",
std::chrono::duration_cast(duration).count()},
{"timestamp", std::chrono::system_clock::now()}
});
}
};
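The Python optimization half of this example is only referenced above. A minimal sketch of it follows, with a deliberately simplified (hypothetical) cost function: the optimizer watches the controller's execution telemetry and proposes a new horizon on the 'control.optimize' topic that the MATLAB module subscribes to:

from nexus_sdk import Module
from scipy.optimize import minimize_scalar

class ControllerOptimizer(Module):
    def __init__(self):
        super().__init__()
        self.samples = []

    async def on_initialize(self):
        await self.messages.subscribe("control.telemetry", self.on_telemetry)

    async def on_telemetry(self, msg):
        self.samples.append(msg.payload["execution_time_us"])
        if len(self.samples) >= 1000:
            await self.retune()

    async def retune(self):
        # Hypothetical trade-off: longer horizons improve tracking (second
        # term) but cost execution time against the 1 ms budget (first term)
        mean_us = sum(self.samples) / len(self.samples)

        def cost(horizon):
            return (mean_us / 1000.0) * horizon + 100.0 / horizon

        result = minimize_scalar(cost, bounds=(1, 50), method="bounded")
        await self.messages.publish("control.optimize",
                                    {"horizon": round(result.x)})
        self.samples.clear()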
Example 3: Legacy System Integration
Integrating LabVIEW test equipment with modern microservices.
// LabVIEW - Legacy Test Equipment Wrapper
// TestEquipmentModule.lvclass
// Initialize.vi
1. Initialize VISA Resources
- Find all connected instruments
- Open VISA sessions
2. Configure Message Bus Connection
- Create NEXUS client
- Register module capabilities
3. Start Polling Loop
- Read instrument data at configured rate
- Publish to message bus
// Handle Commands.vi
Case Structure for Command Routing:
Case "configure":
- Parse configuration from message
- Apply to instrument via VISA Write
- Return confirmation
Case "measure":
- Trigger measurement
- Read result via VISA Read
- Parse and publish data
Case "calibrate":
- Run calibration sequence
- Store calibration factors
- Publish calibration report
// Publish Measurement.vi
1. Create Measurement Cluster
- Timestamp
- Instrument ID
- Channel
- Value
- Unit
- Validity flags
2. Convert to JSON
- Use JSON Encode.vi
- Add metadata
3. Publish to Message Bus
- Set topic: "instruments.[instrument_id].data"
- Set content type: "application/json"
- Send message
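On the microservices side, any SDK client can consume these measurements with a wildcard subscription. A Python sketch follows; the JSON key names mirror the measurement cluster above but are assumptions about the exact encoding, as is the mid-topic wildcard:

from nexus_sdk import NexusClient

async def monitor_instruments():
    client = NexusClient("ws://localhost:8080")
    await client.connect()
    # Matches "instruments.[instrument_id].data" for every instrument
    await client.subscribe("instruments.*.data", on_measurement)

async def on_measurement(msg):
    m = msg.payload  # JSON produced by Publish Measurement.vi
    if not m.get("valid", True):  # skip readings the VI flagged invalid
        return
    print(f"{m['instrument_id']} ch{m['channel']}: {m['value']} {m['unit']}")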
Integration Benefits
- Preserve existing LabVIEW investment while modernizing architecture
- Enable real-time data streaming from test equipment
- Add modern analytics and ML capabilities to legacy systems
- Implement centralized monitoring and control
Best Practices Summary
Cross-Platform Development Guidelines
- Design for Language Strengths: Use each language where it excels
- Standardize Interfaces: Define clear contracts between modules (see the sketch after this list)
- Handle Errors Gracefully: Plan for cross-language error propagation
- Monitor Performance: Track overhead at language boundaries
- Version Carefully: Coordinate updates across all languages
- Document Thoroughly: Explain integration points and data flows
- Test Exhaustively: Include integration tests in CI/CD pipeline
- Plan for Scale: Design for distributed deployment from the start
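As a sketch of the "Standardize Interfaces" guideline, a contract can be as small as a versioned dataclass that every language serializes to the same JSON shape; the field names here are illustrative:

from dataclasses import dataclass, asdict

@dataclass
class SensorReadingV1:
    """Contract for 'raw.sensor.data' messages; bump schema_version on change."""
    schema_version: str
    sensor_id: str
    timestamp_us: int
    values: list

reading = SensorReadingV1("1.0", "vib-01", 1718000000000000, [0.12, 0.31])
payload = asdict(reading)  # the dict that gets published to the bus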
Self-Healing Systems
Build resilient, autonomous systems that detect and recover from failures automatically. The NEXUS-1 SDK provides the foundation for implementing self-healing capabilities that maintain high availability without human intervention.
Overview
Why Self-Healing Systems?
In mission-critical industrial environments, system failures can have severe consequences:
- Downtime Costs: Every minute of downtime can result in significant financial losses
- Safety Risks: Manual intervention may not be fast enough to prevent hazards
- 24/7 Operations: Systems must operate continuously without human supervision
- Complex Dependencies: Modern systems have intricate interdependencies that require intelligent recovery
- Scale Challenges: Manual management becomes impractical as systems grow
Self-Healing Patterns
Health Check Pattern
Implement continuous health monitoring to detect issues before they cause failures.
public class SelfHealingModule : ModuleBase
{
private readonly HealthChecker _healthChecker;
private readonly RecoveryManager _recoveryManager;
public SelfHealingModule()
{
_healthChecker = new HealthChecker();
_recoveryManager = new RecoveryManager();
}
protected override async Task OnInitializeAsync()
{
// Register health checks
_healthChecker.AddCheck("memory", CheckMemoryHealth);
_healthChecker.AddCheck("connections", CheckConnectionHealth);
_healthChecker.AddCheck("processing", CheckProcessingHealth);
// Start continuous health monitoring
_ = Task.Run(MonitorHealthAsync);
}
private async Task MonitorHealthAsync()
{
while (!CancellationToken.IsCancellationRequested)
{
var healthReport = await _healthChecker.CheckAllAsync();
if (healthReport.Status != HealthStatus.Healthy)
{
await HandleUnhealthyState(healthReport);
}
// Publish health status
await Messages.PublishAsync("health.status", healthReport);
await Task.Delay(TimeSpan.FromSeconds(30));
}
}
    private async Task<HealthCheckResult> CheckMemoryHealth()
{
var process = Process.GetCurrentProcess();
var memoryMB = process.WorkingSet64 / (1024 * 1024);
if (memoryMB > 1000) // 1GB threshold
{
return HealthCheckResult.Unhealthy(
"High memory usage detected",
new { MemoryMB = memoryMB }
);
}
return HealthCheckResult.Healthy();
}
private async Task HandleUnhealthyState(HealthReport report)
{
Logger.Warning("Unhealthy state detected", report);
// Attempt self-healing
foreach (var issue in report.UnhealthyChecks)
{
var recovery = _recoveryManager.GetRecoveryStrategy(issue.Name);
if (recovery != null)
{
await recovery.ExecuteAsync(issue);
}
}
}
}
from nexus_sdk import Module
import asyncio
import psutil
from enum import Enum
from typing import Dict, Callable, Any
class HealthStatus(Enum):
HEALTHY = "healthy"
DEGRADED = "degraded"
UNHEALTHY = "unhealthy"
class SelfHealingModule(Module):
def __init__(self):
super().__init__()
self.health_checks = {}
self.recovery_strategies = {}
self.health_history = []
async def on_initialize(self):
# Register health checks
self.add_health_check("memory", self.check_memory_health)
self.add_health_check("cpu", self.check_cpu_health)
self.add_health_check("queue", self.check_queue_health)
# Register recovery strategies
self.add_recovery("memory", self.recover_memory)
self.add_recovery("cpu", self.recover_cpu_load)
# Start health monitoring
asyncio.create_task(self.monitor_health())
def add_health_check(self, name: str, check_func: Callable):
self.health_checks[name] = check_func
def add_recovery(self, name: str, recovery_func: Callable):
self.recovery_strategies[name] = recovery_func
async def monitor_health(self):
while not self.shutdown_requested:
health_report = await self.perform_health_checks()
# Record history for trend analysis
self.health_history.append(health_report)
if len(self.health_history) > 100:
self.health_history.pop(0)
# Analyze trends
if self.detect_degradation_trend():
await self.preemptive_recovery()
# Handle current issues
if health_report["status"] != HealthStatus.HEALTHY:
await self.handle_unhealthy_state(health_report)
# Publish health metrics
await self.messages.publish("health.status", health_report)
await asyncio.sleep(30)
async def check_memory_health(self) -> Dict[str, Any]:
memory = psutil.virtual_memory()
if memory.percent > 80:
return {
"status": HealthStatus.UNHEALTHY,
"message": "High memory usage",
"metrics": {
"percent": memory.percent,
"available_mb": memory.available / 1024 / 1024
}
}
elif memory.percent > 60:
return {
"status": HealthStatus.DEGRADED,
"message": "Moderate memory usage",
"metrics": {"percent": memory.percent}
}
return {"status": HealthStatus.HEALTHY}
async def recover_memory(self, health_check: Dict):
self.logger.info("Attempting memory recovery")
# Clear caches
self.clear_internal_caches()
# Force garbage collection
import gc
gc.collect()
# If still high, shed load
if psutil.virtual_memory().percent > 70:
await self.shed_non_critical_load()
def detect_degradation_trend(self) -> bool:
if len(self.health_history) < 5:
return False
# Check if health has been degrading
recent_statuses = [h["status"] for h in self.health_history[-5:]]
degraded_count = sum(1 for s in recent_statuses
if s != HealthStatus.HEALTHY)
return degraded_count >= 3
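monitor_health() above calls perform_health_checks(), which is elided. A minimal version inside SelfHealingModule could aggregate the registered checks like this, assuming each check returns a dict with a "status" key as check_memory_health() does:

    async def perform_health_checks(self) -> Dict[str, Any]:
        results = {}
        worst = HealthStatus.HEALTHY
        for name, check in self.health_checks.items():
            result = await check()
            results[name] = result
            status = result["status"]
            if status == HealthStatus.UNHEALTHY:
                worst = HealthStatus.UNHEALTHY
            elif status == HealthStatus.DEGRADED and worst == HealthStatus.HEALTHY:
                worst = HealthStatus.DEGRADED
        return {"status": worst, "checks": results}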
#include <nexus/module.hpp>
#include <chrono>
#include <thread>
class SelfHealingModule : public nexus::ModuleBase {
private:
struct HealthCheck {
std::string name;
std::function<HealthStatus()> checker;
std::chrono::steady_clock::time_point last_healthy;
};
std::vector<HealthCheck> health_checks_;
std::atomic<bool> monitoring_active_{true};
std::thread health_monitor_thread_;
public:
async_task<void> on_initialize() override {
// Register health checks
register_health_check("latency",
[this] { return check_latency_health(); });
register_health_check("throughput",
[this] { return check_throughput_health(); });
register_health_check("errors",
[this] { return check_error_rate_health(); });
// Start health monitoring thread
health_monitor_thread_ = std::thread([this] {
monitor_health_loop();
});
co_return;
}
void register_health_check(const std::string& name,
std::function<HealthStatus()> checker) {
health_checks_.push_back({name, checker,
std::chrono::steady_clock::now()});
}
void monitor_health_loop() {
while (monitoring_active_) {
auto overall_status = HealthStatus::Healthy;
std::vector<HealthIssue> issues;
// Run all health checks
for (auto& check : health_checks_) {
auto status = check.checker();
if (status == HealthStatus::Healthy) {
check.last_healthy = std::chrono::steady_clock::now();
} else {
auto unhealthy_duration =
std::chrono::steady_clock::now() - check.last_healthy;
issues.push_back({
check.name,
status,
unhealthy_duration
});
if (status == HealthStatus::Critical) {
overall_status = HealthStatus::Critical;
} else if (overall_status != HealthStatus::Critical) {
overall_status = HealthStatus::Degraded;
}
}
}
// Handle any issues
if (!issues.empty()) {
handle_health_issues(issues);
}
// Publish health status
publish_health_status(overall_status, issues);
std::this_thread::sleep_for(std::chrono::seconds(30));
}
}
void handle_health_issues(const std::vector<HealthIssue>& issues) {
for (const auto& issue : issues) {
// Log the issue
logger()->warn("Health issue detected: {} - Status: {}",
issue.name, to_string(issue.status));
// Apply recovery strategy based on issue type
if (issue.name == "latency" &&
issue.duration > std::chrono::minutes(5)) {
// Reset connections if latency persists
reset_connections();
} else if (issue.name == "errors" &&
issue.status == HealthStatus::Critical) {
// Circuit breaker pattern
enable_circuit_breaker();
}
}
}
};
Circuit Breaker Pattern
The SDK provides built-in circuit breaker functionality to prevent cascading failures by temporarily disabling failing components.
public class ResilientServiceModule : ModuleBase
{
private readonly ICircuitBreaker _externalApiBreaker;
private readonly ICircuitBreaker _databaseBreaker;
public ResilientServiceModule()
{
// Create circuit breaker for external API calls
_externalApiBreaker = Recovery.CreateCircuitBreaker(
name: "external-api",
failureThreshold: 5, // Open after 5 failures
samplingDuration: TimeSpan.FromMinutes(1), // Within 1 minute
minimumThroughput: 10, // Minimum 10 calls before evaluating
breakDuration: TimeSpan.FromSeconds(30) // Stay open for 30 seconds
);
// Create circuit breaker for database operations
_databaseBreaker = Recovery.CreateCircuitBreaker(
name: "database",
failureThreshold: 3,
samplingDuration: TimeSpan.FromSeconds(30),
breakDuration: TimeSpan.FromMinutes(1)
);
}
    public async Task<ApiResponse> CallExternalApiAsync(ApiRequest request)
{
try
{
// Execute operation through circuit breaker
return await _externalApiBreaker.ExecuteAsync(async () =>
{
using var client = new HttpClient();
var response = await client.PostAsJsonAsync(
"https://api.example.com/data",
request
);
response.EnsureSuccessStatusCode();
                return await response.Content.ReadFromJsonAsync<ApiResponse>();
});
}
catch (CircuitBreakerOpenException ex)
{
Logger.Warning($"Circuit breaker is open: {ex.Message}");
// Return cached or default response
return GetCachedResponse(request) ?? ApiResponse.Empty;
}
}
// Monitor circuit breaker state
protected override async Task OnInitializeAsync()
{
// Subscribe to circuit breaker state changes
_externalApiBreaker.StateChanged += (sender, args) =>
{
Logger.Info($"Circuit breaker '{args.Name}' changed from " +
$"{args.PreviousState} to {args.CurrentState}");
// Publish state change for monitoring
Messages.PublishAsync("circuit-breaker.state-changed", new
{
Name = args.Name,
PreviousState = args.PreviousState.ToString(),
CurrentState = args.CurrentState.ToString(),
Timestamp = DateTime.UtcNow
});
};
// Register health check
Health.AddCheck("external-api-circuit", () =>
{
return _externalApiBreaker.State == CircuitState.Closed
? HealthCheckResult.Healthy()
: HealthCheckResult.Degraded($"Circuit breaker is {_externalApiBreaker.State}");
});
}
}
Automatic Failover Pattern
Seamlessly switch to backup systems when primary systems fail.
public class FailoverManager : ModuleBase
{
    private List<IServiceEndpoint> _endpoints;  // assigned during initialization, so not readonly
private IServiceEndpoint _activeEndpoint;
private readonly object _lock = new object();
protected override async Task OnInitializeAsync()
{
// Register primary and backup endpoints
        _endpoints = new List<IServiceEndpoint>
{
new ServiceEndpoint("primary", "tcp://primary:5000", priority: 1),
new ServiceEndpoint("secondary", "tcp://backup1:5000", priority: 2),
new ServiceEndpoint("tertiary", "tcp://backup2:5000", priority: 3)
};
// Start with highest priority endpoint
_activeEndpoint = _endpoints.OrderBy(e => e.Priority).First();
// Monitor endpoint health
_ = Task.Run(MonitorEndpointsAsync);
}
private async Task MonitorEndpointsAsync()
{
while (!CancellationToken.IsCancellationRequested)
{
            var healthyEndpoints = new List<IServiceEndpoint>();
// Check all endpoints
foreach (var endpoint in _endpoints)
{
if (await endpoint.IsHealthyAsync())
{
healthyEndpoints.Add(endpoint);
}
}
// Failover if needed
lock (_lock)
{
if (!_activeEndpoint.IsHealthy && healthyEndpoints.Any())
{
var newEndpoint = healthyEndpoints
.OrderBy(e => e.Priority)
.First();
Logger.Warning($"Failing over from {_activeEndpoint.Name} " +
$"to {newEndpoint.Name}");
PerformFailover(_activeEndpoint, newEndpoint);
_activeEndpoint = newEndpoint;
}
}
await Task.Delay(TimeSpan.FromSeconds(10));
}
}
private void PerformFailover(IServiceEndpoint from, IServiceEndpoint to)
{
// Drain in-flight requests
from.DrainConnections();
// Transfer state if needed
var state = from.ExportState();
to.ImportState(state);
// Switch traffic
UpdateRoutingTable(to);
// Notify dependent modules
Messages.PublishAsync("failover.completed", new
{
From = from.Name,
To = to.Name,
Timestamp = DateTime.UtcNow
});
}
}
Failure Detection
Anomaly Detection
Use statistical analysis and machine learning to detect abnormal behavior.
import numpy as np
from collections import deque
from nexus_sdk import Module
class AnomalyDetector(Module):
def __init__(self):
super().__init__()
self.metrics_history = {}
self.anomaly_thresholds = {}
self.window_size = 100
async def on_initialize(self):
# Subscribe to system metrics
await self.messages.subscribe(
"metrics.*",
self.analyze_metric
)
# Initialize ML models for complex patterns
self.load_anomaly_models()
def load_anomaly_models(self):
# Load pre-trained isolation forest model
self.isolation_forest = self.load_model(
"models/system_anomaly_detector.pkl"
)
async def analyze_metric(self, message):
metric_name = message.topic.split('.')[-1]
value = message.payload['value']
# Initialize history if needed
if metric_name not in self.metrics_history:
self.metrics_history[metric_name] = deque(
maxlen=self.window_size
)
history = self.metrics_history[metric_name]
history.append(value)
# Need enough data for analysis
if len(history) < 20:
return
# Statistical anomaly detection
if self.is_statistical_anomaly(metric_name, value, history):
await self.handle_anomaly(
metric_name,
value,
"statistical"
)
# Pattern-based anomaly detection
if self.is_pattern_anomaly(metric_name, history):
await self.handle_anomaly(
metric_name,
value,
"pattern"
)
def is_statistical_anomaly(self, metric, value, history):
# Calculate statistics
values = list(history)
mean = np.mean(values)
std = np.std(values)
# Adaptive threshold based on metric stability
if std < 0.1 * mean: # Stable metric
threshold = 3
else: # Volatile metric
threshold = 4
# Check if value is outside threshold
z_score = abs((value - mean) / std) if std > 0 else 0
return z_score > threshold
def is_pattern_anomaly(self, metric, history):
# Detect sudden changes in pattern
if len(history) < self.window_size:
return False
# Compare recent pattern with historical
recent = list(history)[-20:]
historical = list(history)[:-20]
# Use statistical tests
from scipy import stats
_, p_value = stats.ks_2samp(recent, historical)
return p_value < 0.01 # Significant difference
async def handle_anomaly(self, metric, value, anomaly_type):
self.logger.warning(
f"Anomaly detected in {metric}: {value} ({anomaly_type})"
)
# Publish anomaly event
await self.messages.publish("anomaly.detected", {
"metric": metric,
"value": value,
"type": anomaly_type,
"timestamp": self.get_timestamp(),
"severity": self.calculate_severity(metric, value)
})
# Trigger self-healing if critical
if self.is_critical_metric(metric):
await self.initiate_recovery(metric, value)
public class AnomalyDetector : ModuleBase
{
    private readonly Dictionary<string, MetricAnalyzer> _analyzers;
private readonly IMLModel _anomalyModel;
public AnomalyDetector()
{
        _analyzers = new Dictionary<string, MetricAnalyzer>();
_anomalyModel = LoadAnomalyModel();
}
protected override async Task OnInitializeAsync()
{
// Subscribe to all metrics
await Messages.SubscribeAsync("metrics.*", AnalyzeMetricAsync);
// Start predictive analysis
_ = Task.Run(PredictiveAnalysisLoop);
}
private async Task AnalyzeMetricAsync(Message message)
{
var metricName = message.Topic.Split('.').Last();
        var value = message.GetPayload<MetricValue>();
// Get or create analyzer for this metric
if (!_analyzers.TryGetValue(metricName, out var analyzer))
{
analyzer = new MetricAnalyzer(metricName);
_analyzers[metricName] = analyzer;
}
// Analyze for anomalies
var anomalies = analyzer.Analyze(value);
foreach (var anomaly in anomalies)
{
await HandleAnomalyAsync(anomaly);
}
}
private async Task PredictiveAnalysisLoop()
{
while (!CancellationToken.IsCancellationRequested)
{
// Collect recent metrics
var metricsSnapshot = CollectMetricsSnapshot();
// Run ML model for complex pattern detection
var predictions = await _anomalyModel.PredictAsync(
metricsSnapshot
);
// Check for predicted failures
foreach (var prediction in predictions
.Where(p => p.FailureProbability > 0.8))
{
await Messages.PublishAsync("failure.predicted", new
{
Component = prediction.Component,
Probability = prediction.FailureProbability,
TimeToFailure = prediction.EstimatedTimeToFailure,
RecommendedAction = prediction.RecommendedAction
});
// Initiate preemptive recovery
if (prediction.FailureProbability > 0.95)
{
await InitiatePreemptiveRecovery(
prediction.Component
);
}
}
await Task.Delay(TimeSpan.FromMinutes(5));
}
}
}
public class MetricAnalyzer
{
private readonly string _metricName;
    private readonly RingBuffer<double> _values;
private readonly AdaptiveThreshold _threshold;
public MetricAnalyzer(string metricName)
{
_metricName = metricName;
        _values = new RingBuffer<double>(1000);
_threshold = new AdaptiveThreshold();
}
    public IEnumerable<Anomaly> Analyze(MetricValue value)
{
_values.Add(value.Value);
        var anomalies = new List<Anomaly>();
// Spike detection
if (_threshold.IsSpike(value.Value, _values))
{
anomalies.Add(new Anomaly
{
Type = AnomalyType.Spike,
Metric = _metricName,
Value = value.Value,
Severity = CalculateSeverity(value.Value)
});
}
// Trend detection
if (DetectTrend(_values) is Trend trend &&
trend.IsAnomalous)
{
anomalies.Add(new Anomaly
{
Type = AnomalyType.Trend,
Metric = _metricName,
TrendInfo = trend
});
}
return anomalies;
}
}
Recovery Mechanisms
Automatic Recovery Strategies
Implement various recovery strategies based on the type and severity of failures.
public interface IRecoveryStrategy
{
string Name { get; }
bool CanRecover(FailureContext context);
    Task<RecoveryResult> RecoverAsync(FailureContext context);
}
public class RecoveryOrchestrator : ModuleBase
{
    private readonly List<IRecoveryStrategy> _strategies;
private readonly RecoveryHistory _history;
public RecoveryOrchestrator()
{
        _strategies = new List<IRecoveryStrategy>
{
new RestartRecovery(),
new ResourceCleanupRecovery(),
new StateResetRecovery(),
new LoadSheddingRecovery(),
new FailoverRecovery()
};
_history = new RecoveryHistory();
}
protected override async Task OnInitializeAsync()
{
// Subscribe to failure events
await Messages.SubscribeAsync(
"failure.detected",
HandleFailureAsync
);
}
private async Task HandleFailureAsync(Message message)
{
        var context = message.GetPayload<FailureContext>();
// Check if we're in a recovery loop
if (_history.IsRecoveryLooping(context))
{
await EscalateFailure(context);
return;
}
// Find applicable recovery strategies
var applicableStrategies = _strategies
.Where(s => s.CanRecover(context))
.OrderBy(s => GetStrategyPriority(s, context));
// Try each strategy until one succeeds
foreach (var strategy in applicableStrategies)
{
Logger.Info($"Attempting recovery with {strategy.Name}");
try
{
var result = await strategy.RecoverAsync(context);
if (result.Success)
{
_history.RecordSuccess(context, strategy);
await Messages.PublishAsync("recovery.completed", new
{
Failure = context,
Strategy = strategy.Name,
Duration = result.Duration
});
return;
}
}
catch (Exception ex)
{
Logger.Error($"Recovery strategy {strategy.Name} failed", ex);
}
}
// All strategies failed
await EscalateFailure(context);
}
}
// Example recovery strategies
public class RestartRecovery : IRecoveryStrategy
{
public string Name => "Restart";
public bool CanRecover(FailureContext context)
{
return context.FailureType == FailureType.Crash ||
context.FailureType == FailureType.Hang;
}
    public async Task<RecoveryResult> RecoverAsync(FailureContext context)
{
var stopwatch = Stopwatch.StartNew();
// Stop the failed module
await context.Module.StopAsync();
// Clean up resources
await context.Module.CleanupAsync();
// Wait for cleanup to complete
await Task.Delay(TimeSpan.FromSeconds(5));
// Restart the module
await context.Module.StartAsync();
// Verify it's healthy
var healthCheck = await context.Module.CheckHealthAsync();
return new RecoveryResult
{
Success = healthCheck.IsHealthy,
Duration = stopwatch.Elapsed
};
}
}
public class LoadSheddingRecovery : IRecoveryStrategy
{
public string Name => "Load Shedding";
public bool CanRecover(FailureContext context)
{
return context.FailureType == FailureType.Overload ||
context.Metrics?.CpuUsage > 90 ||
context.Metrics?.MemoryUsage > 85;
}
    public async Task<RecoveryResult> RecoverAsync(FailureContext context)
{
var stopwatch = Stopwatch.StartNew();
// Identify non-critical operations
var operations = await context.Module.GetOperationsAsync();
var nonCritical = operations
.Where(op => op.Priority == Priority.Low)
.ToList();
// Temporarily disable non-critical operations
foreach (var op in nonCritical)
{
await op.DisableAsync(TimeSpan.FromMinutes(10));
}
// Reduce processing rate
await context.Module.SetProcessingRateAsync(0.5);
// Monitor recovery
await Task.Delay(TimeSpan.FromSeconds(30));
var metrics = await context.Module.GetMetricsAsync();
var recovered = metrics.CpuUsage < 70 && metrics.MemoryUsage < 70;
if (recovered)
{
// Gradually restore operations
_ = Task.Run(async () =>
{
await Task.Delay(TimeSpan.FromMinutes(5));
await RestoreOperationsGradually(context.Module, nonCritical);
});
}
return new RecoveryResult
{
Success = recovered,
Duration = stopwatch.Elapsed
};
}
}
State Persistence and Recovery
Maintain module state to enable quick recovery after failures.
public abstract class StatefulModule : ModuleBase
{
private readonly IStateStore _stateStore;
private readonly TimeSpan _checkpointInterval;
private Timer _checkpointTimer;
protected StatefulModule(IStateStore stateStore)
{
_stateStore = stateStore;
_checkpointInterval = TimeSpan.FromMinutes(5);
}
protected override async Task OnInitializeAsync()
{
// Restore state from last checkpoint
var lastState = await _stateStore.GetLatestAsync(ModuleId);
if (lastState != null)
{
await RestoreStateAsync(lastState);
Logger.Info("Restored from checkpoint: " + lastState.Timestamp);
}
// Start periodic checkpointing
_checkpointTimer = new Timer(
async _ => await CheckpointStateAsync(),
null,
_checkpointInterval,
_checkpointInterval
);
// Subscribe to critical events for immediate checkpointing
await Messages.SubscribeAsync(
"critical.state.change",
async msg => await CheckpointStateAsync()
);
}
private async Task CheckpointStateAsync()
{
try
{
var state = await CaptureStateAsync();
await _stateStore.SaveAsync(new StateCheckpoint
{
ModuleId = ModuleId,
Timestamp = DateTime.UtcNow,
State = state,
Version = GetStateVersion()
});
}
catch (Exception ex)
{
Logger.Error("Failed to checkpoint state", ex);
}
}
    protected abstract Task<ModuleState> CaptureStateAsync();
protected abstract Task RestoreStateAsync(StateCheckpoint checkpoint);
protected abstract string GetStateVersion();
}
// Example implementation
public class DataProcessingModule : StatefulModule
{
    private readonly Queue<WorkItem> _workQueue;  // element types assumed for illustration
    private readonly Dictionary<string, ProcessingState> _processingStates;
private long _processedCount;
    protected override async Task<ModuleState> CaptureStateAsync()
{
return new ModuleState
{
WorkQueue = _workQueue.ToList(),
            ProcessingStates = new Dictionary<string, ProcessingState>(
_processingStates
),
ProcessedCount = _processedCount,
            Metadata = new Dictionary<string, object>
{
["LastProcessedTime"] = DateTime.UtcNow,
["QueueDepth"] = _workQueue.Count
}
};
}
protected override async Task RestoreStateAsync(StateCheckpoint checkpoint)
{
var state = checkpoint.State;
// Restore work queue
_workQueue.Clear();
foreach (var item in state.WorkQueue)
{
_workQueue.Enqueue(item);
}
// Restore processing states
_processingStates.Clear();
foreach (var kvp in state.ProcessingStates)
{
_processingStates[kvp.Key] = kvp.Value;
}
_processedCount = state.ProcessedCount;
// Resume processing
await ResumeProcessingAsync();
}
}
Coordination and Orchestration
Distributed Recovery Coordination
Coordinate recovery efforts across multiple modules to maintain system consistency.
public class DistributedRecoveryCoordinator : ModuleBase
{
private readonly IConsensusService _consensus;
    private readonly Dictionary<string, HealthStatus> _moduleHealth;
    private RecoveryPlan _currentPlan;  // reassigned as plans execute, so not readonly
protected override async Task OnInitializeAsync()
{
// Join consensus group for recovery decisions
await _consensus.JoinGroupAsync("recovery.coordinators");
// Subscribe to health updates from all modules
await Messages.SubscribeAsync("health.status.*", UpdateModuleHealth);
// Subscribe to recovery requests
await Messages.SubscribeAsync("recovery.requested", HandleRecoveryRequest);
}
private async Task HandleRecoveryRequest(Message message)
{
        var request = message.GetPayload<RecoveryRequest>();
// Check if we're already handling a recovery
if (_currentPlan != null && !_currentPlan.IsComplete)
{
// Queue or reject based on priority
if (request.Priority > _currentPlan.Priority)
{
await AbortCurrentRecovery();
}
else
{
await QueueRecoveryRequest(request);
return;
}
}
// Propose recovery plan to consensus group
var plan = await CreateRecoveryPlan(request);
var proposal = new RecoveryProposal
{
Plan = plan,
ProposerId = ModuleId,
Timestamp = DateTime.UtcNow
};
// Achieve consensus on recovery plan
var approved = await _consensus.ProposeAsync(
"recovery.plan",
proposal,
TimeSpan.FromSeconds(30)
);
if (approved)
{
await ExecuteRecoveryPlan(plan);
}
}
    private async Task<RecoveryPlan> CreateRecoveryPlan(RecoveryRequest request)
{
var plan = new RecoveryPlan
{
Id = Guid.NewGuid(),
Priority = request.Priority,
            Steps = new List<RecoveryStep>()
};
// Analyze dependencies
var dependencies = await AnalyzeDependencies(request.FailedModule);
// Create recovery steps in order
// 1. Isolate failed module
plan.Steps.Add(new RecoveryStep
{
Order = 1,
Action = RecoveryAction.Isolate,
Target = request.FailedModule,
Timeout = TimeSpan.FromSeconds(30)
});
// 2. Stop dependent modules
foreach (var dep in dependencies.Downstream)
{
plan.Steps.Add(new RecoveryStep
{
Order = 2,
Action = RecoveryAction.Pause,
Target = dep,
Timeout = TimeSpan.FromSeconds(60)
});
}
// 3. Recover failed module
plan.Steps.Add(new RecoveryStep
{
Order = 3,
Action = RecoveryAction.Recover,
Target = request.FailedModule,
Strategy = DetermineRecoveryStrategy(request),
Timeout = TimeSpan.FromMinutes(5)
});
// 4. Restore dependent modules
foreach (var dep in dependencies.Downstream.Reverse())
{
plan.Steps.Add(new RecoveryStep
{
Order = 4,
Action = RecoveryAction.Resume,
Target = dep,
Timeout = TimeSpan.FromSeconds(60)
});
}
return plan;
}
private async Task ExecuteRecoveryPlan(RecoveryPlan plan)
{
_currentPlan = plan;
await Messages.PublishAsync("recovery.started", plan);
foreach (var step in plan.Steps.OrderBy(s => s.Order))
{
try
{
await ExecuteRecoveryStep(step);
step.Status = StepStatus.Completed;
step.CompletedAt = DateTime.UtcNow;
await Messages.PublishAsync("recovery.step.completed", step);
}
catch (Exception ex)
{
step.Status = StepStatus.Failed;
step.Error = ex.Message;
Logger.Error($"Recovery step failed: {step.Action} on {step.Target}", ex);
// Decide whether to continue or abort
if (step.IsCritical)
{
await AbortRecovery(plan, $"Critical step failed: {step.Action}");
return;
}
}
}
plan.IsComplete = true;
await Messages.PublishAsync("recovery.completed", plan);
}
}
Real-World Examples
Example 1: Self-Healing Data Pipeline
A data processing pipeline that automatically recovers from various failure scenarios.
public class SelfHealingDataPipeline : ModuleBase
{
private readonly PipelineStageManager _stageManager;
private readonly HealthMonitor _healthMonitor;
private readonly RecoveryEngine _recoveryEngine;
protected override async Task OnInitializeAsync()
{
// Initialize pipeline stages
_stageManager.AddStage("ingestion", new DataIngestionStage());
_stageManager.AddStage("validation", new ValidationStage());
_stageManager.AddStage("transformation", new TransformationStage());
_stageManager.AddStage("storage", new StorageStage());
// Configure health monitoring
_healthMonitor.AddCheck("throughput", CheckThroughput);
_healthMonitor.AddCheck("latency", CheckLatency);
_healthMonitor.AddCheck("error_rate", CheckErrorRate);
_healthMonitor.AddCheck("backpressure", CheckBackpressure);
// Configure recovery strategies
_recoveryEngine.AddStrategy("stage_restart", new StageRestartStrategy());
_recoveryEngine.AddStrategy("buffer_clear", new BufferClearStrategy());
_recoveryEngine.AddStrategy("rate_limit", new RateLimitStrategy());
_recoveryEngine.AddStrategy("reroute", new RerouteStrategy());
// Start monitoring
_ = Task.Run(MonitorPipelineHealth);
}
private async Task MonitorPipelineHealth()
{
while (!CancellationToken.IsCancellationRequested)
{
var healthStatus = await _healthMonitor.CheckAllAsync();
if (healthStatus.HasIssues)
{
await HandleHealthIssues(healthStatus);
}
// Predictive maintenance
var predictions = await AnalyzeTrends();
if (predictions.FailureLikely)
{
await PerformPreventiveMaintenance(predictions);
}
await Task.Delay(TimeSpan.FromSeconds(10));
}
}
private async Task HandleHealthIssues(HealthStatus status)
{
foreach (var issue in status.Issues)
{
Logger.Warning($"Health issue detected: {issue.Name}");
// Determine recovery strategy
var strategy = _recoveryEngine.SelectStrategy(issue);
try
{
// Execute recovery
var result = await strategy.ExecuteAsync(new RecoveryContext
{
Issue = issue,
Pipeline = this,
StageManager = _stageManager
});
if (result.Success)
{
Logger.Info($"Recovery successful: {strategy.Name}");
await Messages.PublishAsync("recovery.success", new
{
Issue = issue.Name,
Strategy = strategy.Name,
Duration = result.Duration
});
}
else
{
// Escalate if recovery failed
await EscalateIssue(issue, result);
}
}
catch (Exception ex)
{
Logger.Error($"Recovery failed for {issue.Name}", ex);
await EscalateIssue(issue, ex);
}
}
}
// Example recovery strategy implementation
public class StageRestartStrategy : IRecoveryStrategy
{
        public async Task<RecoveryResult> ExecuteAsync(RecoveryContext context)
{
var stopwatch = Stopwatch.StartNew();
// Identify problematic stage
var stage = context.StageManager.GetStage(context.Issue.Component);
// Graceful shutdown
await stage.DrainAsync(TimeSpan.FromSeconds(30));
await stage.StopAsync();
// Clear any corrupted state
await stage.ClearStateAsync();
// Restart with reduced capacity
stage.Configuration.MaxThroughput *= 0.7;
await stage.StartAsync();
// Gradually increase capacity
_ = Task.Run(async () =>
{
await Task.Delay(TimeSpan.FromMinutes(5));
stage.Configuration.MaxThroughput /= 0.7;
});
return new RecoveryResult
{
Success = true,
Duration = stopwatch.Elapsed
};
}
}
}
Example 2: Self-Healing Microservices Mesh
A microservices system with automatic service discovery and healing.
public class SelfHealingServiceMesh : ModuleBase
{
private readonly ServiceRegistry _registry;
private readonly LoadBalancer _loadBalancer;
    private readonly Dictionary<string, ICircuitBreaker> _circuitBreakers;
public SelfHealingServiceMesh()
{
_registry = new ServiceRegistry();
_loadBalancer = new LoadBalancer();
        _circuitBreakers = new Dictionary<string, ICircuitBreaker>();
}
protected override async Task OnInitializeAsync()
{
// Start service health monitoring
_ = Task.Run(MonitorServiceHealth);
// Subscribe to service events
await Messages.SubscribeAsync("service.registered", OnServiceRegistered);
await Messages.SubscribeAsync("service.failed", OnServiceFailed);
}
private async Task MonitorServiceHealth()
{
while (!CancellationToken.IsCancellationRequested)
{
var services = await _registry.GetAllServicesAsync();
var healthChecks = services.Select(async service =>
{
var health = await CheckServiceHealth(service);
if (!health.IsHealthy)
{
await HandleUnhealthyService(service, health);
}
return new { Service = service, Health = health };
});
var results = await Task.WhenAll(healthChecks);
// Update load balancer weights based on health
UpdateLoadBalancerWeights(results);
await Task.Delay(TimeSpan.FromSeconds(5));
}
}
private async Task HandleUnhealthyService(
ServiceInstance service,
HealthCheckResult health)
{
// Get or create circuit breaker for this service
if (!_circuitBreakers.TryGetValue(service.Id, out var circuitBreaker))
{
circuitBreaker = Recovery.CreateCircuitBreaker(
name: $"service-{service.Id}",
failureThreshold: 3,
samplingDuration: TimeSpan.FromMinutes(1),
breakDuration: TimeSpan.FromMinutes(2)
);
_circuitBreakers[service.Id] = circuitBreaker;
}
// The SDK's circuit breaker automatically tracks failures
// when exceptions occur during ExecuteAsync calls
if (circuitBreaker.State == CircuitState.Open)
{
// Remove from load balancer rotation
await _loadBalancer.RemoveInstanceAsync(service);
// Attempt recovery
await AttemptServiceRecovery(service);
}
}
private async Task AttemptServiceRecovery(ServiceInstance service)
{
Logger.Info($"Attempting recovery for service: {service.Id}");
// Try different recovery strategies
        var strategies = new Func<Task<bool>>[]
{
() => RestartService(service),
() => RedeployService(service),
() => ScaleUpService(service),
() => MigrateService(service)
};
foreach (var strategy in strategies)
{
try
{
var success = await strategy();
if (success)
{
Logger.Info($"Service {service.Id} recovered");
// Add back to load balancer with reduced weight
await _loadBalancer.AddInstanceAsync(
service,
weight: 0.1
);
// Gradually increase weight
_ = Task.Run(async () =>
{
await GraduallyRestoreService(service);
});
return;
}
}
catch (Exception ex)
{
Logger.Error($"Recovery strategy failed for {service.Id}", ex);
}
}
// All strategies failed - escalate
await EscalateServiceFailure(service);
}
private async Task GraduallyRestoreService(ServiceInstance service)
{
var weights = new[] { 0.25, 0.5, 0.75, 1.0 };
foreach (var weight in weights)
{
await Task.Delay(TimeSpan.FromMinutes(5));
// Check if service is still healthy
var health = await CheckServiceHealth(service);
if (!health.IsHealthy)
{
Logger.Warning($"Service {service.Id} degraded during restoration");
return;
}
// Increase weight
await _loadBalancer.UpdateWeightAsync(service, weight);
Logger.Info($"Service {service.Id} weight increased to {weight}");
}
}
}
Best Practices
Self-Healing System Guidelines
- Design for Failure: Assume components will fail and plan recovery strategies
- Health Checks First: Implement thorough health monitoring before recovery
- Gradual Recovery: Restore services gradually to avoid overwhelming the system
- Avoid Recovery Loops: Track recovery attempts and escalate if stuck (see the sketch after this list)
- State Management: Maintain consistent state through failures and recovery
- Test Recovery Paths: Regularly test recovery mechanisms in controlled environments
- Monitor Recovery: Track recovery metrics and success rates
- Human Escalation: Know when to involve human operators
- Document Patterns: Record failure patterns and successful recovery strategies
- Resource Limits: Set boundaries on resource usage during recovery
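For the "Avoid Recovery Loops" guideline above, attempt tracking can be a simple sliding window per failure key. A Python sketch with illustrative defaults (escalate once the same failure is "recovered" three times in ten minutes):

import time
from collections import defaultdict, deque

class RecoveryHistory:
    def __init__(self, max_attempts=3, window_s=600):
        self.max_attempts = max_attempts
        self.window_s = window_s
        self.attempts = defaultdict(deque)

    def record(self, failure_key: str):
        self.attempts[failure_key].append(time.monotonic())

    def is_recovery_looping(self, failure_key: str) -> bool:
        # Drop attempts outside the window, then count what remains
        now = time.monotonic()
        window = self.attempts[failure_key]
        while window and now - window[0] > self.window_s:
            window.popleft()
        return len(window) >= self.max_attempts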
Graceful Degradation
Build systems that maintain essential functionality under adverse conditions. The NEXUS-1 SDK enables modules to degrade gracefully, providing the best possible service level when full functionality cannot be maintained.
Overview
Why Graceful Degradation?
In real-world systems, maintaining some functionality is often better than complete failure:
- Partial Availability: Core features remain accessible even when optional features fail
- Resource Conservation: Prioritize critical operations when resources are limited
- User Experience: Provide degraded but usable service rather than error messages
- System Stability: Prevent cascading failures by reducing load proactively
- Business Continuity: Maintain revenue-generating operations during incidents
Degradation Strategies
Feature Toggle Pattern
Dynamically enable or disable features based on system health and resource availability.
public class GracefulDegradationModule : ModuleBase
{
private readonly FeatureManager _featureManager;
private readonly ResourceMonitor _resourceMonitor;
private DegradationLevel _currentLevel = DegradationLevel.Full;
public GracefulDegradationModule()
{
_featureManager = new FeatureManager();
_resourceMonitor = new ResourceMonitor();
// Define features with priorities
_featureManager.RegisterFeature("real-time-analytics",
priority: FeaturePriority.Optional,
resourceCost: ResourceCost.High);
_featureManager.RegisterFeature("data-validation",
priority: FeaturePriority.Important,
resourceCost: ResourceCost.Medium);
_featureManager.RegisterFeature("core-processing",
priority: FeaturePriority.Critical,
resourceCost: ResourceCost.Low);
}
protected override async Task OnInitializeAsync()
{
// Start resource monitoring
_ = Task.Run(MonitorResourcesAsync);
// Subscribe to system health events
await Messages.SubscribeAsync("system.health", HandleHealthUpdate);
}
private async Task MonitorResourcesAsync()
{
while (!CancellationToken.IsCancellationRequested)
{
var resources = await _resourceMonitor.GetCurrentResourcesAsync();
var newLevel = DetermineDegradationLevel(resources);
if (newLevel != _currentLevel)
{
await TransitionToLevel(newLevel);
}
await Task.Delay(TimeSpan.FromSeconds(30));
}
}
private DegradationLevel DetermineDegradationLevel(ResourceStatus resources)
{
// CPU critical
if (resources.CpuUsage > 90)
return DegradationLevel.Essential;
// Memory pressure
if (resources.MemoryUsage > 85)
return DegradationLevel.Reduced;
// High load but manageable
if (resources.CpuUsage > 70 || resources.MemoryUsage > 70)
return DegradationLevel.Limited;
// Normal operation
return DegradationLevel.Full;
}
private async Task TransitionToLevel(DegradationLevel newLevel)
{
Logger.Info($"Transitioning from {_currentLevel} to {newLevel}");
var previousLevel = _currentLevel;
_currentLevel = newLevel;
// Update feature availability
switch (newLevel)
{
case DegradationLevel.Full:
_featureManager.EnableAll();
break;
case DegradationLevel.Limited:
_featureManager.DisableFeatures(FeaturePriority.Optional);
break;
case DegradationLevel.Reduced:
_featureManager.DisableFeatures(
FeaturePriority.Optional,
FeaturePriority.Important);
break;
case DegradationLevel.Essential:
_featureManager.EnableOnlyFeatures(FeaturePriority.Critical);
break;
}
// Notify other modules
await Messages.PublishAsync("degradation.level.changed", new
{
PreviousLevel = previousLevel,
CurrentLevel = newLevel,
EnabledFeatures = _featureManager.GetEnabledFeatures(),
Timestamp = DateTime.UtcNow
});
}
    public async Task<T> ExecuteWithDegradation<T>(
        string feature,
        Func<Task<T>> operation,
        Func<Task<T>> fallback = null)
{
if (_featureManager.IsEnabled(feature))
{
try
{
return await operation();
}
catch (Exception ex) when (fallback != null)
{
Logger.Warning($"Feature {feature} failed, using fallback", ex);
return await fallback();
}
}
else if (fallback != null)
{
return await fallback();
}
else
{
throw new FeatureDisabledException(feature, _currentLevel);
}
}
}
from nexus_sdk import Module
from enum import Enum, auto
from typing import Dict, List, Callable, Optional
import asyncio
import psutil

class FeatureDisabledException(Exception):
    """Raised when a feature is unavailable at the current degradation level."""
class FeaturePriority(Enum):
CRITICAL = auto()
IMPORTANT = auto()
OPTIONAL = auto()
class DegradationLevel(Enum):
FULL = auto()
LIMITED = auto()
REDUCED = auto()
ESSENTIAL = auto()
class GracefulDegradationModule(Module):
def __init__(self):
super().__init__()
self.features = {}
self.current_level = DegradationLevel.FULL
self.degradation_policies = {}
self.fallback_handlers = {}
async def on_initialize(self):
# Register features
self.register_feature(
"ml_predictions",
priority=FeaturePriority.OPTIONAL,
resource_weight=0.3
)
self.register_feature(
"data_enrichment",
priority=FeaturePriority.IMPORTANT,
resource_weight=0.2
)
self.register_feature(
"core_processing",
priority=FeaturePriority.CRITICAL,
resource_weight=0.1
)
# Define degradation policies
self.define_degradation_policy()
# Start monitoring
asyncio.create_task(self.monitor_and_adapt())
def register_feature(self, name: str, priority: FeaturePriority,
resource_weight: float):
self.features[name] = {
"priority": priority,
"resource_weight": resource_weight,
"enabled": True,
"fallback": None
}
def register_fallback(self, feature: str, fallback: Callable):
if feature in self.features:
self.features[feature]["fallback"] = fallback
def define_degradation_policy(self):
self.degradation_policies = {
DegradationLevel.FULL: {
"cpu_threshold": 70,
"memory_threshold": 70,
"enabled_priorities": [
FeaturePriority.CRITICAL,
FeaturePriority.IMPORTANT,
FeaturePriority.OPTIONAL
]
},
DegradationLevel.LIMITED: {
"cpu_threshold": 80,
"memory_threshold": 80,
"enabled_priorities": [
FeaturePriority.CRITICAL,
FeaturePriority.IMPORTANT
]
},
DegradationLevel.REDUCED: {
"cpu_threshold": 90,
"memory_threshold": 85,
"enabled_priorities": [
FeaturePriority.CRITICAL
]
},
DegradationLevel.ESSENTIAL: {
"cpu_threshold": 95,
"memory_threshold": 90,
"enabled_priorities": [
FeaturePriority.CRITICAL
],
"additional_measures": ["rate_limiting", "queue_shedding"]
}
}
async def monitor_and_adapt(self):
while not self.shutdown_requested:
# Get current resource usage
cpu_percent = psutil.cpu_percent(interval=1)
memory = psutil.virtual_memory()
# Determine appropriate level
new_level = self.calculate_degradation_level(
cpu_percent,
memory.percent
)
if new_level != self.current_level:
await self.transition_to_level(new_level)
await asyncio.sleep(10)
def calculate_degradation_level(self, cpu: float, memory: float) -> DegradationLevel:
levels = [
(DegradationLevel.ESSENTIAL, 95, 90),
(DegradationLevel.REDUCED, 90, 85),
(DegradationLevel.LIMITED, 80, 80),
(DegradationLevel.FULL, 0, 0)
]
for level, cpu_threshold, mem_threshold in levels:
if cpu >= cpu_threshold or memory >= mem_threshold:
return level
return DegradationLevel.FULL
async def transition_to_level(self, new_level: DegradationLevel):
self.logger.info(f"Transitioning from {self.current_level} to {new_level}")
old_level = self.current_level
self.current_level = new_level
# Update feature states
policy = self.degradation_policies[new_level]
enabled_priorities = policy["enabled_priorities"]
for feature_name, feature_info in self.features.items():
should_enable = feature_info["priority"] in enabled_priorities
feature_info["enabled"] = should_enable
# Apply additional measures if needed
if "additional_measures" in policy:
await self.apply_additional_measures(policy["additional_measures"])
# Notify system
await self.messages.publish("degradation.level.changed", {
"previous_level": old_level.name,
"current_level": new_level.name,
"enabled_features": [
name for name, info in self.features.items()
if info["enabled"]
],
"timestamp": self.get_timestamp()
})
async def execute_with_degradation(self, feature: str,
operation: Callable,
*args, **kwargs):
feature_info = self.features.get(feature)
if not feature_info:
raise ValueError(f"Unknown feature: {feature}")
if feature_info["enabled"]:
try:
return await operation(*args, **kwargs)
except Exception as e:
if feature_info["fallback"]:
self.logger.warning(
f"Feature {feature} failed, using fallback: {e}"
)
return await feature_info["fallback"](*args, **kwargs)
raise
else:
# Feature disabled due to degradation
if feature_info["fallback"]:
return await feature_info["fallback"](*args, **kwargs)
else:
raise FeatureDisabledException(
f"Feature {feature} disabled at level {self.current_level}"
)
async def apply_additional_measures(self, measures: List[str]):
for measure in measures:
if measure == "rate_limiting":
await self.enable_rate_limiting()
elif measure == "queue_shedding":
await self.enable_queue_shedding()
elif measure == "cache_only":
await self.enable_cache_only_mode()
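A caller typically pairs each feature with a cheap fallback; the handler names below are hypothetical:

# Inside an async caller: prefer the ML path, fall back to a rule-based
# estimate whenever "ml_predictions" is disabled or raises at runtime.
module.register_fallback("ml_predictions", rule_based_estimate)
result = await module.execute_with_degradation(
    "ml_predictions", run_ml_inference, window
)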
#include <nexus/module.hpp>
#include <atomic>
#include <unordered_map>
enum class FeaturePriority {
Critical = 0,
Important = 1,
Optional = 2
};
enum class DegradationLevel {
Full,
Limited,
Reduced,
Essential
};
class GracefulDegradationModule : public nexus::ModuleBase {
private:
struct Feature {
std::string name;
FeaturePriority priority;
std::atomic<bool> enabled{true};
double resource_cost;
std::function<void()> disable_callback;
};
std::unordered_map<std::string, Feature> features_;
std::atomic<DegradationLevel> current_level_{DegradationLevel::Full};
std::mutex feature_mutex_;
public:
async_task<void> on_initialize() override {
// Register features
register_feature("heavy_computation",
FeaturePriority::Optional, 0.4);
register_feature("logging_verbose",
FeaturePriority::Optional, 0.1);
register_feature("caching",
FeaturePriority::Important, 0.2);
register_feature("core_logic",
FeaturePriority::Critical, 0.1);
// Start monitoring thread
std::thread([this] { monitor_resources(); }).detach();
co_return;
}
void register_feature(const std::string& name,
FeaturePriority priority,
double resource_cost) {
std::lock_guard<std::mutex> lock(feature_mutex_);
features_[name] = Feature{
name, priority, true, resource_cost, nullptr
};
}
template<typename T>
async_task<T> execute_with_degradation(
const std::string& feature,
std::function<async_task<T>()> operation,
std::function<async_task<T>()> fallback = nullptr) {
auto it = features_.find(feature);
if (it == features_.end()) {
throw std::runtime_error("Unknown feature: " + feature);
}
if (it->second.enabled.load()) {
try {
co_return co_await operation();
} catch (const std::exception& e) {
if (fallback) {
logger()->warn("Feature {} failed: {}, using fallback",
feature, e.what());
co_return co_await fallback();
}
throw;
}
} else if (fallback) {
co_return co_await fallback();
} else {
throw std::runtime_error(
"Feature " + feature + " disabled at level " +
to_string(current_level_.load())
);
}
}
private:
void monitor_resources() {
while (!is_shutting_down()) {
auto resources = get_system_resources();
auto new_level = calculate_degradation_level(resources);
if (new_level != current_level_.load()) {
transition_to_level(new_level);
}
std::this_thread::sleep_for(std::chrono::seconds(10));
}
}
DegradationLevel calculate_degradation_level(
const SystemResources& resources) {
if (resources.cpu_usage > 90 || resources.memory_usage > 90) {
return DegradationLevel::Essential;
} else if (resources.cpu_usage > 80 || resources.memory_usage > 85) {
return DegradationLevel::Reduced;
} else if (resources.cpu_usage > 70 || resources.memory_usage > 70) {
return DegradationLevel::Limited;
}
return DegradationLevel::Full;
}
void transition_to_level(DegradationLevel new_level) {
logger()->info("Transitioning from {} to {}",
to_string(current_level_.load()),
to_string(new_level));
current_level_ = new_level;
// Update feature states based on level
std::lock_guard<std::mutex> lock(feature_mutex_);
for (auto& [name, feature] : features_) {
bool should_enable = should_feature_be_enabled(
feature.priority, new_level
);
if (feature.enabled != should_enable) {
feature.enabled = should_enable;
if (!should_enable && feature.disable_callback) {
feature.disable_callback();
}
}
}
// Publish state change
publish_degradation_change(new_level);
}
bool should_feature_be_enabled(FeaturePriority priority,
DegradationLevel level) {
switch (level) {
case DegradationLevel::Full:
return true;
case DegradationLevel::Limited:
return priority != FeaturePriority::Optional;
case DegradationLevel::Reduced:
return priority == FeaturePriority::Critical;
case DegradationLevel::Essential:
return priority == FeaturePriority::Critical;
}
return false;
}
};
Load Shedding Pattern
Proactively drop non-critical work to maintain system stability.
public class LoadSheddingModule : ModuleBase
{
    private readonly PriorityQueue<WorkItem> _workQueue;  // custom queue supporting Remove/PeekLowestPriority
private readonly LoadMonitor _loadMonitor;
private readonly SheddingPolicy _policy;
public LoadSheddingModule()
{
        _workQueue = new PriorityQueue<WorkItem>();
_loadMonitor = new LoadMonitor();
        _policy = new SheddingPolicy
        {
            CriticalThreshold = 0.9,
            HighThreshold = 0.7,
            NormalThreshold = 0.5,
            MaxQueueSize = 1000  // referenced below; value illustrative
        };
}
protected override async Task OnMessageAsync(Message message)
{
var workItem = new WorkItem
{
Id = Guid.NewGuid(),
Priority = DeterminePriority(message),
Message = message,
ReceivedAt = DateTime.UtcNow
};
// Check if we should accept this work
var currentLoad = await _loadMonitor.GetCurrentLoadAsync();
if (ShouldShedWork(workItem, currentLoad))
{
await HandleShedWork(workItem);
return;
}
// Add to queue with possible eviction
await EnqueueWithEviction(workItem, currentLoad);
}
private bool ShouldShedWork(WorkItem item, LoadStatus load)
{
if (load.Percentage >= _policy.CriticalThreshold)
{
// Only accept critical work
return item.Priority != Priority.Critical;
}
else if (load.Percentage >= _policy.HighThreshold)
{
// Shed low priority work
return item.Priority == Priority.Low;
}
return false;
}
private async Task EnqueueWithEviction(WorkItem newItem, LoadStatus load)
{
// If queue is at capacity, consider eviction
if (_workQueue.Count >= _policy.MaxQueueSize)
{
// Find lowest priority item that's lower than new item
var victim = _workQueue.PeekLowestPriority();
if (victim != null && victim.Priority < newItem.Priority)
{
_workQueue.Remove(victim);
await HandleEvictedWork(victim);
Logger.Info($"Evicted work item {victim.Id} for {newItem.Id}");
}
else
{
// New item is lowest priority, shed it
await HandleShedWork(newItem);
return;
}
}
_workQueue.Enqueue(newItem);
// Update metrics
Telemetry.RecordGauge("queue_depth", _workQueue.Count);
Telemetry.RecordGauge("queue_load", load.Percentage);
}
private async Task HandleShedWork(WorkItem item)
{
Logger.Warning($"Shedding work item {item.Id} due to load");
// Send degraded response
await Messages.PublishAsync(item.Message.ReplyTo, new
{
Status = "ServiceDegraded",
Message = "Request dropped due to high load",
RetryAfter = TimeSpan.FromSeconds(30),
AlternativeEndpoint = GetAlternativeEndpoint()
});
// Record metrics
Telemetry.IncrementCounter("work_items_shed",
new[] { "priority", item.Priority.ToString() });
}
}
Progressive Degradation Levels
Multi-Level Degradation Strategy
Define multiple degradation levels with specific behaviors and transitions.
public interface IDegradationStrategy
{
DegradationLevel Level { get; }
Task ActivateAsync();
Task DeactivateAsync();
bool ShouldTransitionTo(SystemMetrics metrics);
}
public class ProgressiveDegradationManager : ModuleBase
{
    private readonly List<IDegradationStrategy> _strategies;
private IDegradationStrategy _currentStrategy;
private readonly SystemMetricsCollector _metricsCollector;
public ProgressiveDegradationManager()
{
        _strategies = new List<IDegradationStrategy>
{
new FullFunctionalityStrategy(),
new ReducedAccuracyStrategy(),
new CacheOnlyStrategy(),
new ReadOnlyStrategy(),
new MaintenanceModeStrategy()
};
_metricsCollector = new SystemMetricsCollector();
_currentStrategy = _strategies.First();
}
protected override async Task OnInitializeAsync()
{
// Start with full functionality
await _currentStrategy.ActivateAsync();
// Monitor and adapt
_ = Task.Run(MonitorAndAdaptAsync);
}
private async Task MonitorAndAdaptAsync()
{
while (!CancellationToken.IsCancellationRequested)
{
var metrics = await _metricsCollector.CollectAsync();
// Check if we need to change strategy
var newStrategy = DetermineOptimalStrategy(metrics);
if (newStrategy != _currentStrategy)
{
await TransitionToStrategy(newStrategy);
}
await Task.Delay(TimeSpan.FromSeconds(15));
}
}
private IDegradationStrategy DetermineOptimalStrategy(SystemMetrics metrics)
{
// Try to upgrade if possible
for (int i = 0; i < _strategies.Count - 1; i++)
{
if (_strategies[i].ShouldTransitionTo(metrics))
{
return _strategies[i];
}
}
// Otherwise, we need the most degraded state
return _strategies.Last();
}
private async Task TransitionToStrategy(IDegradationStrategy newStrategy)
{
Logger.Info($"Transitioning from {_currentStrategy.Level} to {newStrategy.Level}");
// Deactivate current
await _currentStrategy.DeactivateAsync();
// Activate new
await newStrategy.ActivateAsync();
_currentStrategy = newStrategy;
// Notify system
await Messages.PublishAsync("degradation.strategy.changed", new
{
Strategy = newStrategy.GetType().Name,
Level = newStrategy.Level,
Timestamp = DateTime.UtcNow
});
}
}
// Example strategies
public class ReducedAccuracyStrategy : IDegradationStrategy
{
public DegradationLevel Level => DegradationLevel.ReducedAccuracy;
public async Task ActivateAsync()
{
// Switch to faster, less accurate algorithms
AlgorithmRegistry.SetMode(AlgorithmMode.Fast);
// Reduce sampling rates
SamplingConfig.SetRate(0.1); // 10% sampling
// Disable expensive validations
ValidationConfig.DisableLevel(ValidationLevel.Deep);
await Task.CompletedTask;
}
public async Task DeactivateAsync()
{
// Restore normal operations
AlgorithmRegistry.SetMode(AlgorithmMode.Accurate);
SamplingConfig.SetRate(1.0);
ValidationConfig.EnableAll();
await Task.CompletedTask;
}
public bool ShouldTransitionTo(SystemMetrics metrics)
{
return metrics.CpuUsage < 70 &&
metrics.MemoryUsage < 70 &&
metrics.ResponseTime95th < TimeSpan.FromMilliseconds(500);
}
}
public class CacheOnlyStrategy : IDegradationStrategy
{
public DegradationLevel Level => DegradationLevel.CacheOnly;
public async Task ActivateAsync()
{
// Disable all external calls
ExternalServices.DisableAll();
// Enable aggressive caching
CacheConfig.SetMode(CacheMode.Aggressive);
CacheConfig.SetTTL(TimeSpan.FromHours(24));
// Return stale data if necessary
CacheConfig.AllowStaleData = true;
await Task.CompletedTask;
}
public async Task DeactivateAsync()
{
ExternalServices.EnableAll();
CacheConfig.SetMode(CacheMode.Normal);
CacheConfig.AllowStaleData = false;
await Task.CompletedTask;
}
public bool ShouldTransitionTo(SystemMetrics metrics)
{
// Only if system is stable with cache-only mode
return metrics.CpuUsage < 50 &&
metrics.MemoryUsage < 60 &&
metrics.CacheHitRate > 0.8;
}
}
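The strategy manager and feature checks on this page compare DegradationLevel values directly, so the enum must be ordered from least to most degraded. The platform's actual definition is not shown here; the sketch below collects the level names used in the examples, with the numeric ordering itself being an assumption.
// Assumed ordering: lower values mean less degradation, so checks like
// _currentLevel <= DegradationLevel.Limited read naturally.
public enum DegradationLevel
{
    Full = 0,            // All features enabled
    Limited = 1,         // Optional extras (e.g. recommendations) disabled
    Reduced = 2,         // Simplified algorithms and reduced accuracy
    ReducedAccuracy = 2, // Alias used by ReducedAccuracyStrategy
    CacheOnly = 3,       // Serve cached data only
    ReadOnly = 4,        // Reject writes, serve reads
    Essential = 5,       // Critical paths only (e.g. cart and checkout)
    Maintenance = 6      // Maintenance mode, reject most traffic
}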
Quality of Service Management
Dynamic QoS Adjustment
Adjust service quality based on system capacity and client priorities.
public class QualityOfServiceManager : ModuleBase
{
private Dictionary<string, ClientProfile> _clientProfiles;
private readonly ResourceAllocator _resourceAllocator;
private readonly IRateLimiter _rateLimiter; // Assumed rate-limiting abstraction; see AdaptiveRateLimiter below
private QoSLevel _systemQoS = QoSLevel.Premium;
protected override async Task OnInitializeAsync()
{
// Load client profiles
_clientProfiles = await LoadClientProfilesAsync();
// Subscribe to resource events
await Messages.SubscribeAsync("resource.availability", UpdateQoS);
}
public async Task<ServiceResponse> ExecuteWithQoS(
string clientId,
ServiceRequest request,
Func<QoSParameters, Task<ServiceResponse>> operation)
{
var profile = GetClientProfile(clientId);
var qosParams = CalculateQoSParameters(profile, request);
// Apply QoS constraints
using (var lease = await _resourceAllocator.AcquireAsync(qosParams))
{
// Set operation constraints
var constrainedOperation = ApplyConstraints(operation, qosParams);
try
{
return await constrainedOperation(qosParams);
}
finally
{
// Record QoS metrics
await RecordQoSMetrics(clientId, qosParams, lease);
}
}
}
private QoSParameters CalculateQoSParameters(
ClientProfile profile,
ServiceRequest request)
{
var baseParams = new QoSParameters
{
Priority = profile.Tier,
MaxLatency = profile.SLA.MaxLatency,
MinThroughput = profile.SLA.MinThroughput,
MaxResourceUsage = profile.ResourceQuota
};
// Adjust based on system QoS level
switch (_systemQoS)
{
case QoSLevel.Premium:
return baseParams;
case QoSLevel.Standard:
baseParams.MaxLatency *= 1.5;
baseParams.MinThroughput *= 0.8;
baseParams.MaxResourceUsage *= 0.7;
break;
case QoSLevel.Economy:
baseParams.MaxLatency *= 2.0;
baseParams.MinThroughput *= 0.5;
baseParams.MaxResourceUsage *= 0.5;
break;
case QoSLevel.BestEffort:
baseParams.MaxLatency = TimeSpan.FromSeconds(30);
baseParams.MinThroughput = 0;
baseParams.MaxResourceUsage *= 0.2;
break;
}
return baseParams;
}
private Func<QoSParameters, Task<ServiceResponse>> ApplyConstraints(
Func<QoSParameters, Task<ServiceResponse>> operation,
QoSParameters qos)
{
return async (parameters) =>
{
using (var cts = new CancellationTokenSource(qos.MaxLatency))
{
// Apply rate limiting
await _rateLimiter.WaitAsync(qos.Priority, cts.Token);
// Apply resource constraints
using (var resourceGuard = new ResourceGuard(qos.MaxResourceUsage))
{
return await operation(parameters);
}
}
};
}
}
// Adaptive rate limiting based on QoS
public class AdaptiveRateLimiter : ModuleBase
{
private readonly Dictionary<Priority, TokenBucket> _buckets;
private readonly SystemLoadMonitor _loadMonitor;
public AdaptiveRateLimiter()
{
_buckets = new Dictionary<Priority, TokenBucket>
{
[Priority.Critical] = new TokenBucket(1000, 100), // 1000 tokens, 100/sec
[Priority.High] = new TokenBucket(500, 50),
[Priority.Normal] = new TokenBucket(200, 20),
[Priority.Low] = new TokenBucket(50, 5)
};
}
protected override async Task OnInitializeAsync()
{
_ = Task.Run(AdaptRateLimitsAsync);
}
private async Task AdaptRateLimitsAsync()
{
while (!CancellationToken.IsCancellationRequested)
{
var load = await _loadMonitor.GetLoadAsync();
// Adjust rate limits based on load
foreach (var (priority, bucket) in _buckets)
{
var adjustment = CalculateAdjustment(priority, load);
bucket.AdjustRate(adjustment);
}
await Task.Delay(TimeSpan.FromSeconds(10));
}
}
private double CalculateAdjustment(Priority priority, SystemLoad load)
{
// Protect critical traffic during high load
if (load.Level == LoadLevel.Critical)
{
return priority == Priority.Critical ? 1.0 : 0.1;
}
else if (load.Level == LoadLevel.High)
{
return priority switch
{
Priority.Critical => 1.0,
Priority.High => 0.5,
Priority.Normal => 0.2,
Priority.Low => 0.1,
_ => 0.1
};
}
// Normal operation
return 1.0;
}
}
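TokenBucket is used above but not defined in this snippet. A self-contained sketch of the assumed semantics follows: a fixed capacity, a base refill rate, and AdjustRate applied as a multiplier on that base rate.
// Minimal token bucket sketch (an assumption, not an SDK type).
public class TokenBucket
{
    private readonly object _lock = new object();
    private readonly double _capacity;
    private readonly double _baseRefillRate; // Tokens added per second at multiplier 1.0
    private double _refillRate;
    private double _tokens;
    private DateTime _lastRefill = DateTime.UtcNow;

    public TokenBucket(double capacity, double refillRatePerSecond)
    {
        _capacity = capacity;
        _baseRefillRate = refillRatePerSecond;
        _refillRate = refillRatePerSecond;
        _tokens = capacity;
    }

    // Called by AdaptRateLimitsAsync above: 1.0 = full rate, 0.1 = heavily throttled
    public void AdjustRate(double multiplier)
    {
        lock (_lock) { _refillRate = _baseRefillRate * multiplier; }
    }

    // Consume one token if available; callers queue or shed work on false
    public bool TryTake()
    {
        lock (_lock)
        {
            var now = DateTime.UtcNow;
            _tokens = Math.Min(_capacity, _tokens + (now - _lastRefill).TotalSeconds * _refillRate);
            _lastRefill = now;
            if (_tokens < 1) return false;
            _tokens -= 1;
            return true;
        }
    }
}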
Fallback Implementations
Graceful Fallback Patterns
Provide alternative implementations when primary services are unavailable.
public class FallbackService : ModuleBase
{
private readonly IPrimaryService _primary;
private readonly IFallbackService _fallback;
private readonly ICache _cache;
private readonly ICircuitBreaker _circuitBreaker;
private readonly ISystemLoadMonitor _systemLoad; // Assumed load monitor used by CalculateAsync below
public FallbackService()
{
_primary = new PrimaryService();
_fallback = new SimplifiedService();
_cache = new DistributedCache();
// Use SDK-provided circuit breaker
_circuitBreaker = Recovery.CreateCircuitBreaker(
name: "primary-service",
failureThreshold: 5,
samplingDuration: TimeSpan.FromMinutes(1),
breakDuration: TimeSpan.FromMinutes(1)
);
}
public async Task<DataResult> GetDataAsync(string key)
{
// Try cache first
var cached = await _cache.GetAsync(key);
if (cached != null)
{
Telemetry.IncrementCounter("cache_hit");
return cached;
}
// Try primary service with circuit breaker
try
{
var result = await _circuitBreaker.ExecuteAsync(
async () => await _primary.GetDataAsync(key)
);
// Cache successful result
await _cache.SetAsync(key, result, TimeSpan.FromMinutes(5));
return result;
}
catch (CircuitBreakerOpenException)
{
Logger.Warning("Circuit breaker open, using fallback");
}
catch (Exception ex)
{
Logger.Error("Primary service failed", ex);
}
// Try fallback service
try
{
var fallbackResult = await _fallback.GetSimplifiedDataAsync(key);
return new DataResult
{
Data = fallbackResult,
Quality = DataQuality.Degraded,
Source = "Fallback"
};
}
catch (Exception ex)
{
Logger.Error("Fallback service failed", ex);
}
// Last resort: return default/cached stale data
var staleData = await _cache.GetAsync(key,
ignoreExpiration: true);
if (staleData != null)
{
return new DataResult
{
Data = staleData.Data,
Quality = DataQuality.Stale,
Source = "StaleCache",
Age = staleData.Age
};
}
// Return safe default
return DataResult.Empty;
}
// Fallback for complex calculations
public async Task<CalculationResult> CalculateAsync(CalculationRequest request)
{
// Check if we can use simplified calculation
if (_systemLoad.IsHigh && request.AcceptsDegraded)
{
return await SimplifiedCalculationAsync(request);
}
try
{
return await ComplexCalculationAsync(request);
}
catch (ResourceExhaustedException)
{
if (request.AcceptsDegraded)
{
return await SimplifiedCalculationAsync(request);
}
throw;
}
}
private async Task<CalculationResult> SimplifiedCalculationAsync(
CalculationRequest request)
{
// Use approximation algorithms
var approximation = new ApproximationAlgorithm
{
ErrorTolerance = 0.05, // 5% error acceptable
MaxIterations = 100, // Limit iterations
UseCaching = true
};
var result = await approximation.CalculateAsync(request.Data);
return new CalculationResult
{
Value = result.Value,
Confidence = result.Confidence,
Method = "Approximation",
Quality = CalculationQuality.Approximate
};
}
}
from nexus_sdk import Module
from typing import Optional, Dict, Any
from functools import wraps
import asyncio
from datetime import datetime, timedelta
class FallbackService(Module):
def __init__(self):
super().__init__()
self.cache = {}
self.circuit_breakers = {}
self.fallback_strategies = {
"data_fetch": self.cached_data_fallback,
"computation": self.simplified_computation_fallback,
"external_api": self.mock_response_fallback
}
def with_fallback(operation_type: str):
"""Decorator to add fallback behavior"""
def decorator(func):
@wraps(func)
async def wrapper(self, *args, **kwargs):
# Check circuit breaker
breaker = self.get_circuit_breaker(operation_type)
if breaker.is_open:
self.logger.warning(
f"Circuit breaker open for {operation_type}"
)
return await self.execute_fallback(
operation_type, *args, **kwargs
)
try:
# Try primary operation
result = await func(self, *args, **kwargs)
breaker.record_success()
# Cache successful result
self.cache_result(operation_type, args, result)
return result
except Exception as e:
breaker.record_failure()
self.logger.error(
f"Primary operation {operation_type} failed: {e}"
)
# Execute fallback
return await self.execute_fallback(
operation_type, *args, **kwargs
)
return wrapper
return decorator
async def execute_fallback(self, operation_type: str,
*args, **kwargs):
strategy = self.fallback_strategies.get(operation_type)
if strategy:
try:
result = await strategy(*args, **kwargs)
result["degraded"] = True
result["fallback_type"] = operation_type
return result
except Exception as e:
self.logger.error(f"Fallback failed: {e}")
# Last resort - return safe default
return self.get_safe_default(operation_type)
@with_fallback("data_fetch")
async def fetch_data(self, key: str) -> Dict[str, Any]:
# Primary implementation - external service call
async with self.http_client() as client:
response = await client.get(f"/api/data/{key}")
return response.json()
async def cached_data_fallback(self, key: str) -> Dict[str, Any]:
# Try cache first
cached = self.get_from_cache("data", key)
if cached and not self.is_stale(cached, minutes=60):
return {
"data": cached["data"],
"source": "cache",
"cached_at": cached["timestamp"]
}
# Try stale cache
if cached:
return {
"data": cached["data"],
"source": "stale_cache",
"age_minutes": self.get_age_minutes(cached["timestamp"]),
"warning": "Data may be outdated"
}
# Return default
return {
"data": None,
"source": "default",
"error": "No data available"
}
@with_fallback("computation")
async def perform_computation(self, data: list) -> Dict[str, Any]:
# Primary - accurate but expensive
result = await self.ml_model.predict(data)
return {
"predictions": result.predictions,
"confidence": result.confidence,
"method": "ml_model"
}
async def simplified_computation_fallback(self, data: list) -> Dict[str, Any]:
# Fallback - less accurate but fast
try:
# Try statistical approximation
import numpy as np
mean = np.mean(data)
std = np.std(data)
# Simple threshold-based prediction
predictions = [
1 if (x - mean) / std > 1.5 else 0
for x in data
]
return {
"predictions": predictions,
"confidence": 0.7,
"method": "statistical_approximation"
}
except Exception:
# Ultimate fallback
return {
"predictions": [0] * len(data),
"confidence": 0.3,
"method": "default"
}
async def mock_response_fallback(self, endpoint: str) -> Dict[str, Any]:
"""Generate mock response based on endpoint pattern"""
mock_templates = {
"user": {
"id": "mock_123",
"name": "Mock User",
"status": "active"
},
"product": {
"id": "mock_prod",
"name": "Mock Product",
"available": True
},
"default": {
"status": "ok",
"message": "Service temporarily unavailable",
"mock": True
}
}
# Match endpoint to template
for key, template in mock_templates.items():
if key in endpoint:
return {
**template,
"degraded": True,
"generated_at": datetime.utcnow().isoformat()
}
return mock_templates["default"]
Communication During Degradation
Degradation Status Broadcasting
Keep all system components informed about degradation states and capabilities.
public class DegradationCommunicator : ModuleBase
{
private readonly DegradationState _state;
private readonly HashSet<string> _subscribers;
private readonly ModuleMetricsTracker _metrics; // Assumed helper backing the metric reads below
private Timer _heartbeatTimer;
protected override async Task OnInitializeAsync()
{
// Start heartbeat with degradation status
_heartbeatTimer = new Timer(
async _ => await BroadcastStatusAsync(),
null,
TimeSpan.Zero,
TimeSpan.FromSeconds(30)
);
// Subscribe to degradation requests
await Messages.SubscribeAsync("degradation.query", HandleQueryAsync);
await Messages.SubscribeAsync("degradation.subscribe", HandleSubscribeAsync);
}
private async Task BroadcastStatusAsync()
{
var status = new DegradationStatus
{
ModuleId = ModuleId,
Level = _state.CurrentLevel,
ActiveFeatures = _state.GetActiveFeatures(),
DisabledFeatures = _state.GetDisabledFeatures(),
Capabilities = GetCurrentCapabilities(),
Metrics = new DegradationMetrics
{
ResponseTimeP95 = _metrics.GetPercentile(95),
ErrorRate = _metrics.GetErrorRate(),
ThroughputRatio = _metrics.GetThroughputRatio(),
QueueDepth = _metrics.GetQueueDepth()
},
NextReviewTime = DateTime.UtcNow.AddSeconds(30),
Timestamp = DateTime.UtcNow
};
// Broadcast to all subscribers
await Messages.PublishAsync("degradation.status", status);
// Update service registry
await UpdateServiceRegistryAsync(status);
}
private ServiceCapabilities GetCurrentCapabilities()
{
return new ServiceCapabilities
{
MaxThroughput = _state.CurrentLevel switch
{
DegradationLevel.Full => 1000,
DegradationLevel.Limited => 500,
DegradationLevel.Reduced => 200,
DegradationLevel.Essential => 50,
_ => 0
},
SupportedOperations = _state.GetSupportedOperations(),
ResponseTimeEstimate = _state.GetEstimatedResponseTime(),
AccuracyLevel = _state.GetAccuracyLevel(),
CacheOnly = _state.CurrentLevel == DegradationLevel.CacheOnly,
AcceptingTraffic = _state.IsAcceptingTraffic()
};
}
// Client notification with graceful degradation info
public async Task<Response> ProcessRequestAsync(Request request)
{
var response = new Response();
try
{
// Process with current capabilities
response.Data = await ProcessWithDegradationAsync(request);
response.Success = true;
}
catch (DegradedException ex)
{
response.Success = false;
response.DegradedMode = true;
response.Error = ex.Message;
}
// Always include degradation metadata
response.Metadata = new DegradationMetadata
{
ServiceLevel = _state.CurrentLevel.ToString(),
QualityIndicator = GetQualityIndicator(),
AlternativeEndpoints = GetAlternativeEndpoints(),
RetryAfter = GetRetryAfter(),
DegradationReason = _state.DegradationReason
};
// Add cache headers if in cache-only mode
if (_state.CurrentLevel == DegradationLevel.CacheOnly)
{
response.Headers["X-Cache-Mode"] = "true";
response.Headers["X-Cache-Age"] = GetCacheAge().ToString();
response.Headers["X-Stale-If-Error"] = "true";
}
return response;
}
// Negotiation protocol for degraded communication
public async Task<ServiceAgreement> NegotiateServiceLevelAsync(
ClientRequirements requirements)
{
var currentCapabilities = GetCurrentCapabilities();
// Check if we can meet minimum requirements
if (!CanMeetRequirements(requirements, currentCapabilities))
{
return new ServiceAgreement
{
Accepted = false,
Reason = "Cannot meet minimum requirements in current state",
AlternativeOptions = await GetAlternativeOptionsAsync(requirements)
};
}
// Negotiate best possible service level
return new ServiceAgreement
{
Accepted = true,
ServiceLevel = DetermineServiceLevel(requirements, currentCapabilities),
Guarantees = new ServiceGuarantees
{
MaxLatency = currentCapabilities.ResponseTimeEstimate,
MinAccuracy = currentCapabilities.AccuracyLevel,
Availability = CalculateAvailability(),
ValidUntil = DateTime.UtcNow.AddMinutes(5)
},
Limitations = GetCurrentLimitations(),
CommunicationProtocol = new Protocol
{
RetryStrategy = GetRetryStrategy(),
BackoffPolicy = GetBackoffPolicy(),
CircuitBreakerSettings = GetCircuitBreakerSettings()
}
};
}
}
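On the calling side, a client could run this negotiation before committing to a traffic pattern. A hedged sketch; the ClientRequirements field names are assumptions inferred from the checks above:
// Hypothetical caller-side usage; field names are illustrative.
var agreement = await communicator.NegotiateServiceLevelAsync(new ClientRequirements
{
    MaxLatency = TimeSpan.FromSeconds(2),
    MinAccuracy = 0.9
});
if (!agreement.Accepted)
{
    // Respect the rejection: back off or try a suggested alternative
    Logger.Warning($"Negotiation rejected: {agreement.Reason}");
    return;
}
// Guarantees are time-boxed, so re-negotiate before they lapse
var renewAt = agreement.Guarantees.ValidUntil - TimeSpan.FromSeconds(30);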
Real-World Examples
Example 1: E-Commerce Platform Degradation
An e-commerce system that gracefully degrades during high load events like sales.
public class EcommerceDegradationModule : ModuleBase
{
private readonly FeatureFlags _features;
private readonly LoadBalancer _loadBalancer;
private DegradationLevel _currentLevel = DegradationLevel.Full;
protected override async Task OnInitializeAsync()
{
// Define degradation hierarchy
ConfigureDegradationLevels();
// Start monitoring
_ = Task.Run(MonitorAndAdaptAsync);
}
private void ConfigureDegradationLevels()
{
// Level 1: Disable recommendations
_features.Configure("personalized_recommendations",
enableWhen: () => _currentLevel <= DegradationLevel.Limited);
// Level 2: Simplify search
_features.Configure("advanced_search",
enableWhen: () => _currentLevel <= DegradationLevel.Reduced,
fallback: () => BasicSearchService());
// Level 3: Disable reviews and ratings
_features.Configure("reviews_and_ratings",
enableWhen: () => _currentLevel == DegradationLevel.Full);
// Level 4: Cart and checkout only
_features.Configure("browse_catalog",
enableWhen: () => _currentLevel < DegradationLevel.Essential);
}
public async Task<ProductSearchResult> SearchProductsAsync(
SearchQuery query)
{
return await _features.ExecuteAsync("advanced_search",
async () =>
{
// Full ML-powered search
var embeddings = await _mlService.GetEmbeddingsAsync(query.Text);
var results = await _searchEngine.VectorSearchAsync(embeddings);
return EnrichResults(results);
},
fallback: async () =>
{
// Basic keyword search
var results = await _database.KeywordSearchAsync(query.Text);
return new ProductSearchResult
{
Products = results.Take(20).ToList(),
TotalCount = results.Count,
SearchMethod = "keyword",
DegradedMode = true
};
});
}
public async Task<CartResponse> AddToCartAsync(AddToCartRequest request)
{
// Critical path - always available but with degradation
var response = new CartResponse();
// Check inventory (with fallback)
var hasInventory = await CheckInventoryWithFallbackAsync(
request.ProductId,
request.Quantity
);
if (!hasInventory)
{
response.Success = false;
response.Message = "Product unavailable";
return response;
}
// Add to cart
await _cartService.AddItemAsync(request);
// Optional features based on degradation level
if (_currentLevel <= DegradationLevel.Limited)
{
response.Recommendations = await GetRecommendationsAsync(
request.ProductId
);
}
if (_currentLevel == DegradationLevel.Full)
{
response.PriceHistory = await GetPriceHistoryAsync(
request.ProductId
);
}
response.Success = true;
response.DegradationLevel = _currentLevel.ToString();
return response;
}
private async Task<bool> CheckInventoryWithFallbackAsync(
string productId,
int quantity)
{
try
{
// Real-time inventory check
return await _inventoryService.CheckAvailabilityAsync(
productId,
quantity
);
}
catch
{
// Fallback: Use cached inventory with safety margin
var cached = await _cache.GetAsync($"inventory:{productId}");
if (cached.HasValue)
{
// Apply safety margin for cached data
var safeQuantity = (int)(cached.Value * 0.8);
return quantity <= safeQuantity;
}
// Ultimate fallback: Assume available for popular items
var isPopular = await _cache.GetAsync($"popular:{productId}");
return isPopular;
}
}
}
Example 2: IoT Sensor Network Degradation
An IoT system that adjusts data collection and processing based on network conditions.
public class IoTDegradationModule : ModuleBase
{
private readonly SensorManager _sensorManager;
private readonly DataPipeline _pipeline;
private NetworkCondition _networkCondition = NetworkCondition.Good;
protected override async Task OnInitializeAsync()
{
// Monitor network conditions
_ = Task.Run(MonitorNetworkAsync);
// Configure adaptive sampling
ConfigureAdaptiveSampling();
}
private void ConfigureAdaptiveSampling()
{
_sensorManager.ConfigureSampling(new SamplingStrategy
{
[NetworkCondition.Good] = new SamplingConfig
{
Frequency = TimeSpan.FromSeconds(1),
BatchSize = 1,
Compression = CompressionLevel.None,
IncludeMetadata = true
},
[NetworkCondition.Fair] = new SamplingConfig
{
Frequency = TimeSpan.FromSeconds(5),
BatchSize = 10,
Compression = CompressionLevel.Medium,
IncludeMetadata = false
},
[NetworkCondition.Poor] = new SamplingConfig
{
Frequency = TimeSpan.FromSeconds(30),
BatchSize = 50,
Compression = CompressionLevel.High,
IncludeMetadata = false,
DeltaEncodingOnly = true
},
[NetworkCondition.Critical] = new SamplingConfig
{
Frequency = TimeSpan.FromMinutes(5),
BatchSize = 100,
Compression = CompressionLevel.Maximum,
AggregateOnly = true,
SendOnlyAnomalies = true
}
});
}
public async Task ProcessSensorDataAsync(SensorData data)
{
// Apply degradation based on network condition
switch (_networkCondition)
{
case NetworkCondition.Good:
// Full processing
await ProcessFullPipelineAsync(data);
break;
case NetworkCondition.Fair:
// Skip enrichment
await ProcessReducedPipelineAsync(data);
break;
case NetworkCondition.Poor:
// Local aggregation only
await AggregateLocallyAsync(data);
break;
case NetworkCondition.Critical:
// Store and forward
await StoreForLaterTransmissionAsync(data);
break;
}
}
private async Task ProcessFullPipelineAsync(SensorData data)
{
// Validate
var validated = await _pipeline.ValidateAsync(data);
// Enrich with metadata
var enriched = await _pipeline.EnrichAsync(validated);
// Analyze
var analysis = await _pipeline.AnalyzeAsync(enriched);
// Transmit immediately
await TransmitDataAsync(analysis);
}
private async Task AggregateLocallyAsync(SensorData data)
{
// Add to local buffer
_localBuffer.Add(data);
// Check if we should aggregate
if (_localBuffer.ShouldAggregate())
{
var aggregated = new AggregatedData
{
SensorId = data.SensorId,
StartTime = _localBuffer.OldestTimestamp,
EndTime = _localBuffer.NewestTimestamp,
SampleCount = _localBuffer.Count,
Statistics = CalculateStatistics(_localBuffer),
Anomalies = DetectAnomalies(_localBuffer)
};
// Transmit aggregated data
await TransmitCompressedAsync(aggregated);
_localBuffer.Clear();
}
}
// Edge computing fallback
private async Task EnableEdgeProcessingAsync()
{
Logger.Info("Enabling edge processing mode");
// Start local analytics
_edgeAnalytics.Start(new EdgeConfig
{
RetainDays = 7,
AnomalyThreshold = 3.0,
LocalAlertsOnly = true
});
// Switch to edge mode
await Messages.PublishAsync("mode.edge.activated", new
{
Reason = "Network degradation",
Capabilities = new[]
{
"anomaly_detection",
"basic_aggregation",
"local_alerts"
},
DataRetentionDays = 7
});
}
}
Best Practices
Graceful Degradation Guidelines
- Define Clear Priorities: Know which features are essential vs optional
- Communicate State: Always inform clients about degradation status
- Progressive Degradation: Degrade gradually rather than all at once
- Monitor Recovery: Automatically restore functionality when possible
- Test Degradation Paths: Regularly test all degradation scenarios (see the test sketch after this list)
- Maintain Data Integrity: Never compromise data consistency
- User Experience Focus: Prioritize features users need most
- Clear Fallback Chain: Define explicit fallback sequences
- Resource Boundaries: Set clear limits on resource usage
- Metrics and Monitoring: Track degradation frequency and impact
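As an illustration of the testing guideline, the sketch below forces a degradation level and asserts on the degraded response. This is a hedged example: ForceLevelAsync is a hypothetical test hook, not an SDK API, and the types mirror the DegradationCommunicator example above.
using System.Threading.Tasks;
using Xunit;

public class DegradationPathTests
{
    [Fact]
    public async Task CacheOnlyLevel_ReturnsDegradedMetadata()
    {
        var module = new DegradationCommunicator();
        await module.ForceLevelAsync(DegradationLevel.CacheOnly); // Hypothetical test hook

        var response = await module.ProcessRequestAsync(new Request());

        // Clients must be able to see that they received degraded service
        Assert.Equal("CacheOnly", response.Metadata.ServiceLevel);
        Assert.True(response.Headers.ContainsKey("X-Cache-Mode"));
    }
}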
Real-Time Monitoring
Monitor your NEXUS-1 system in real-time using the monitoring APIs, telemetry collection, and SDKs for building custom dashboards and monitoring applications.
Overview
Real-Time Monitoring Capabilities
NEXUS-1 provides multiple ways to monitor your system in real-time:
- Module Telemetry: Collect metrics, traces, and logs from modules
- NEXUS-1 SDK: Build external monitoring applications
- WebSocket Streaming: Subscribe to live data streams
- REST API: Query current state and historical data
- Integration: Export to Prometheus, Grafana, and other tools
NEXUS-1 SDK
Overview
The NEXUS-1 SDK enables external applications to connect to NEXUS-1 for real-time monitoring, control, and data visualization. Available for multiple platforms and languages.
Connecting to NEXUS-1
import { NexusClient } from '@nexus-1/client-sdk';
// Create client instance
const client = new NexusClient({
url: 'wss://nexus.example.com:8443',
auth: {
type: 'token',
token: process.env.NEXUS_API_TOKEN
},
reconnect: true,
reconnectInterval: 5000
});
// Connect to NEXUS-1
await client.connect();
// Handle connection events
client.on('connected', () => {
console.log('Connected to NEXUS-1');
});
client.on('disconnected', (reason) => {
console.log('Disconnected:', reason);
});
client.on('error', (error) => {
console.error('Connection error:', error);
});
using Nexus1.ClientSDK;
// Create client instance
var client = new NexusClient(new NexusClientOptions
{
ServerUrl = "wss://nexus.example.com:8443",
Authentication = new TokenAuthentication
{
Token = Environment.GetEnvironmentVariable("NEXUS_API_TOKEN")
},
AutoReconnect = true,
ReconnectInterval = TimeSpan.FromSeconds(5)
});
// Connect to NEXUS-1
await client.ConnectAsync();
// Handle connection events
client.Connected += (sender, e) =>
{
Console.WriteLine("Connected to NEXUS-1");
};
client.Disconnected += (sender, e) =>
{
Console.WriteLine($"Disconnected: {e.Reason}");
};
client.Error += (sender, e) =>
{
Console.Error.WriteLine($"Connection error: {e.Error}");
};
from nexus1_client import NexusClient
import asyncio
import os
# Create client instance
client = NexusClient(
url="wss://nexus.example.com:8443",
auth={
"type": "token",
"token": os.environ["NEXUS_API_TOKEN"]
},
auto_reconnect=True,
reconnect_interval=5.0
)
# Connect to NEXUS-1
async def main():
await client.connect()
# Handle connection events
@client.on("connected")
async def on_connected():
print("Connected to NEXUS-1")
@client.on("disconnected")
async def on_disconnected(reason):
print(f"Disconnected: {reason}")
@client.on("error")
async def on_error(error):
print(f"Connection error: {error}")
asyncio.run(main())
import io.nexus1.client.NexusClient;
import io.nexus1.client.NexusClientOptions;
import io.nexus1.client.auth.TokenAuthentication;
// Create client instance
NexusClient client = new NexusClient(
NexusClientOptions.builder()
.serverUrl("wss://nexus.example.com:8443")
.authentication(new TokenAuthentication(
System.getenv("NEXUS_API_TOKEN")
))
.autoReconnect(true)
.reconnectInterval(5000)
.build()
);
// Connect to NEXUS-1
client.connect();
// Handle connection events
client.onConnected(() -> {
System.out.println("Connected to NEXUS-1");
});
client.onDisconnected(reason -> {
System.out.println("Disconnected: " + reason);
});
client.onError(error -> {
System.err.println("Connection error: " + error);
});
package main
import (
"context"
"log"
"os"
"time"
nexus "github.com/nexus-1/client-sdk-go"
)
func main() {
// Create client instance
client := nexus.NewClient(&nexus.ClientOptions{
ServerURL: "wss://nexus.example.com:8443",
Auth: &nexus.TokenAuth{
Token: os.Getenv("NEXUS_API_TOKEN"),
},
AutoReconnect: true,
ReconnectInterval: 5 * time.Second,
})
// Connect to NEXUS-1
ctx := context.Background()
err := client.Connect(ctx)
if err != nil {
log.Fatal("Failed to connect:", err)
}
// Handle connection events
client.OnConnected(func() {
log.Println("Connected to NEXUS-1")
})
client.OnDisconnected(func(reason string) {
log.Printf("Disconnected: %s", reason)
})
client.OnError(func(err error) {
log.Printf("Connection error: %v", err)
})
}
Real-Time Data Subscription
Subscribing to Live Data Streams
Subscribe to real-time data streams from modules, including telemetry, events, and state changes.
// Subscribe to all temperature sensor data
const subscription = await client.subscribe({
topic: 'sensors.temperature.*',
onMessage: (message) => {
console.log('Temperature reading:', message.data);
updateDashboard(message.data);
}
});
// Subscribe to module telemetry
await client.subscribeTelemetry({
modules: ['temperature-monitor', 'pressure-monitor'],
metrics: ['temperature_celsius', 'pressure_bar'],
interval: 1000, // 1 second updates
onUpdate: (telemetry) => {
console.log('Telemetry update:', telemetry);
updateCharts(telemetry);
}
});
// Subscribe to system events
await client.subscribeEvents({
severity: ['warning', 'error', 'critical'],
modules: '*', // All modules
onEvent: (event) => {
console.log('System event:', event);
displayAlert(event);
}
});
// Subscribe to module state changes
await client.subscribeModuleStates({
onStateChange: (change) => {
console.log(`Module ${change.moduleId} state: ${change.newState}`);
updateModuleStatus(change);
}
});
// Unsubscribe when done
await subscription.unsubscribe();
// Subscribe to all temperature sensor data
var subscription = await client.SubscribeAsync(new SubscriptionOptions
{
Topic = "sensors.temperature.*",
OnMessage = (message) =>
{
Console.WriteLine($"Temperature reading: {message.Data}");
UpdateDashboard(message.Data);
}
});
// Subscribe to module telemetry
await client.SubscribeTelemetryAsync(new TelemetrySubscriptionOptions
{
Modules = new[] { "temperature-monitor", "pressure-monitor" },
Metrics = new[] { "temperature_celsius", "pressure_bar" },
Interval = TimeSpan.FromSeconds(1),
OnUpdate = (telemetry) =>
{
Console.WriteLine($"Telemetry update: {telemetry}");
UpdateCharts(telemetry);
}
});
// Subscribe to system events
await client.SubscribeEventsAsync(new EventSubscriptionOptions
{
Severity = new[] { EventSeverity.Warning, EventSeverity.Error, EventSeverity.Critical },
Modules = "*", // All modules
OnEvent = (evt) =>
{
Console.WriteLine($"System event: {evt}");
DisplayAlert(evt);
}
});
// Subscribe to module state changes
await client.SubscribeModuleStatesAsync(new ModuleStateOptions
{
OnStateChange = (change) =>
{
Console.WriteLine($"Module {change.ModuleId} state: {change.NewState}");
UpdateModuleStatus(change);
}
});
// Unsubscribe when done
await subscription.UnsubscribeAsync();
# Subscribe to all temperature sensor data
async def on_temperature(message):
print(f"Temperature reading: {message.data}")
await update_dashboard(message.data)
subscription = await client.subscribe(
topic="sensors.temperature.*",
on_message=on_temperature
)
# Subscribe to module telemetry
async def on_telemetry(telemetry):
print(f"Telemetry update: {telemetry}")
await update_charts(telemetry)
await client.subscribe_telemetry(
modules=["temperature-monitor", "pressure-monitor"],
metrics=["temperature_celsius", "pressure_bar"],
interval=1.0, # 1 second updates
on_update=on_telemetry
)
# Subscribe to system events
async def on_event(event):
print(f"System event: {event}")
await display_alert(event)
await client.subscribe_events(
severity=["warning", "error", "critical"],
modules="*", # All modules
on_event=on_event
)
# Subscribe to module state changes
async def on_state_change(change):
print(f"Module {change.module_id} state: {change.new_state}")
await update_module_status(change)
await client.subscribe_module_states(
on_state_change=on_state_change
)
# Unsubscribe when done
await subscription.unsubscribe()
Querying Real-Time Metrics
REST API for Metrics
Query current metrics and historical data using the REST API.
// Get current metrics for all modules
GET /api/v1/metrics/current
Authorization: Bearer YOUR_API_TOKEN
// Response
{
"modules": {
"temperature-monitor": {
"temperature_celsius": {
"value": 23.5,
"labels": {
"sensor_id": "sensor-1",
"location": "room-1"
},
"timestamp": "2024-01-15T10:30:00Z"
},
"readings_per_second": {
"value": 10.5,
"timestamp": "2024-01-15T10:30:00Z"
}
}
}
}
// Query time-series data
GET /api/v1/metrics/query?metric=temperature_celsius&start=-1h&step=1m
Authorization: Bearer YOUR_API_TOKEN
// Response
{
"metric": "temperature_celsius",
"values": [
{
"timestamp": "2024-01-15T09:30:00Z",
"value": 22.1,
"labels": {"sensor_id": "sensor-1"}
},
{
"timestamp": "2024-01-15T09:31:00Z",
"value": 22.3,
"labels": {"sensor_id": "sensor-1"}
}
// ... more data points
]
}
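From application code, the same endpoint can be queried with any HTTP client. A minimal C# sketch, assuming the token is available in the NEXUS_API_TOKEN environment variable:
using System;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Threading.Tasks;

class MetricsQuery
{
    static async Task Main()
    {
        using var http = new HttpClient { BaseAddress = new Uri("https://nexus.example.com:8443") };
        http.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue(
            "Bearer", Environment.GetEnvironmentVariable("NEXUS_API_TOKEN"));

        // Same query as above: last hour of temperature readings at 1-minute steps
        var json = await http.GetStringAsync(
            "/api/v1/metrics/query?metric=temperature_celsius&start=-1h&step=1m");
        Console.WriteLine(json);
    }
}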
GraphQL API
Use GraphQL for flexible querying of real-time and historical data.
query GetRealTimeData {
modules {
id
name
state
health {
status
lastCheck
details
}
metrics(last: "5m") {
name
current
average
max
min
}
events(severity: ["warning", "error"], limit: 10) {
timestamp
severity
message
details
}
}
telemetry {
temperature_celsius(
modules: ["temperature-monitor"],
interval: "1m",
aggregation: AVG
) {
timestamp
value
labels
}
}
}
# Subscribe to real-time updates
subscription ModuleUpdates {
moduleStateChanged {
moduleId
previousState
newState
timestamp
}
metricUpdate(
metrics: ["temperature_celsius", "pressure_bar"]
) {
metric
value
labels
timestamp
}
systemEvent(severity: ["error", "critical"]) {
id
timestamp
severity
module
message
stackTrace
}
}
Building Real-Time Dashboards
React Dashboard Example
import React, { useEffect, useState } from 'react';
import { NexusClient } from '@nexus-1/client-sdk';
import { LineChart, Line, XAxis, YAxis, CartesianGrid, Tooltip, Legend } from 'recharts';
function TemperatureDashboard() {
const [client, setClient] = useState(null);
const [data, setData] = useState([]);
const [modules, setModules] = useState([]);
const [alerts, setAlerts] = useState([]);
useEffect(() => {
// Initialize client
const nexusClient = new NexusClient({
url: process.env.REACT_APP_NEXUS_URL,
auth: { token: process.env.REACT_APP_NEXUS_TOKEN }
});
nexusClient.connect().then(() => {
setClient(nexusClient);
// Subscribe to temperature data
nexusClient.subscribeTelemetry({
metrics: ['temperature_celsius'],
interval: 1000,
onUpdate: (telemetry) => {
setData(prev => {
const newData = [...prev, {
time: new Date().toLocaleTimeString(),
...telemetry.metrics
}];
// Keep last 100 points
return newData.slice(-100);
});
}
});
// Subscribe to module states
nexusClient.subscribeModuleStates({
onStateChange: (change) => {
setModules(prev =>
prev.map(m =>
m.id === change.moduleId
? { ...m, state: change.newState }
: m
)
);
}
});
// Subscribe to alerts
nexusClient.subscribeEvents({
severity: ['warning', 'error', 'critical'],
onEvent: (event) => {
setAlerts(prev => [{
id: Date.now(),
...event
}, ...prev].slice(0, 50));
}
});
});
return () => {
if (nexusClient) {
nexusClient.disconnect();
}
};
}, []);
return (
<div className="dashboard">
<h1>NEXUS-1 Temperature Monitoring</h1>
<div className="metrics-grid">
<div className="chart-container">
<h2>Temperature Trend</h2>
<LineChart width={600} height={300} data={data}>
<CartesianGrid strokeDasharray="3 3" />
<XAxis dataKey="time" />
<YAxis />
<Tooltip />
<Legend />
<Line
type="monotone"
dataKey="temperature_celsius"
stroke="#8884d8"
activeDot={{ r: 8 }}
/>
</LineChart>
</div>
<div className="modules-status">
<h2>Module Status</h2>
<div className="module-list">
{modules.map(module => (
<div key={module.id} className={`module-card ${module.state}`}>
<h3>{module.name}</h3>
<p>State: {module.state}</p>
<p>Health: {module.health?.status || 'Unknown'}</p>
</div>
))}
</div>
</div>
<div className="alerts-panel">
<h2>Recent Alerts</h2>
<div className="alert-list">
{alerts.map(alert => (
<div key={alert.id} className={`alert ${alert.severity}`}>
<span className="time">{alert.timestamp}</span>
<span className="module">{alert.module}</span>
<span className="message">{alert.message}</span>
</div>
))}
</div>
</div>
</div>
</div>
);
}
Vue.js Dashboard Example
<template>
<div class="nexus-dashboard">
<h1>NEXUS-1 System Monitor</h1>
<div class="connection-status" :class="connectionStatus">
{{ connected ? 'Connected' : 'Disconnected' }}
</div>
<div class="dashboard-grid">
<!-- Real-time metrics -->
<div class="metric-card" v-for="metric in metrics" :key="metric.name">
<h3>{{ metric.name }}</h3>
<div class="metric-value">{{ metric.value }} {{ metric.unit }}</div>
<div class="metric-trend">
<sparkline :data="metric.history" />
</div>
</div>
<!-- Module health grid -->
<div class="health-grid">
<div
v-for="module in modules"
:key="module.id"
class="health-tile"
:class="module.health.status"
>
<div class="module-name">{{ module.name }}</div>
<div class="module-status">{{ module.state }}</div>
</div>
</div>
<!-- Event stream -->
<div class="event-stream">
<h2>Live Events</h2>
<transition-group name="event-list" tag="div">
<div
v-for="event in events"
:key="event.id"
class="event-item"
:class="event.severity"
>
<span class="timestamp">{{ formatTime(event.timestamp) }}</span>
<span class="message">{{ event.message }}</span>
</div>
</transition-group>
</div>
</div>
</div>
</template>
<script>
import { ref, reactive, onMounted, onUnmounted } from 'vue';
import { NexusClient } from '@nexus-1/client-sdk';
export default {
setup() {
const client = ref(null);
const connected = ref(false);
const metrics = reactive([]);
const modules = reactive([]);
const events = reactive([]);
onMounted(async () => {
// Initialize NEXUS client
client.value = new NexusClient({
url: import.meta.env.VITE_NEXUS_URL,
auth: { token: import.meta.env.VITE_NEXUS_TOKEN }
});
// Connect and set up subscriptions
await client.value.connect();
connected.value = true;
// Subscribe to metrics
await client.value.subscribeTelemetry({
metrics: ['*'],
interval: 1000,
onUpdate: (data) => {
updateMetrics(data);
}
});
// Subscribe to module states
await client.value.subscribeModuleStates({
onStateChange: (change) => {
updateModuleState(change);
}
});
// Subscribe to events
await client.value.subscribeEvents({
onEvent: (event) => {
events.unshift({
...event,
id: Date.now()
});
// Keep last 100 events
if (events.length > 100) {
events.pop();
}
}
});
});
onUnmounted(() => {
if (client.value) {
client.value.disconnect();
}
});
const formatTime = (ts) => new Date(ts).toLocaleTimeString();
return {
connected,
metrics,
modules,
events,
formatTime
};
}
};
</script>
Mobile Monitoring Apps
React Native Example
import React, { useEffect, useState } from 'react';
import {
SafeAreaView,
ScrollView,
View,
Text,
StyleSheet,
RefreshControl
} from 'react-native';
import { NexusClient } from '@nexus-1/client-sdk';
import { LineChart } from 'react-native-chart-kit';
import Config from 'react-native-config'; // Assumed source of NEXUS_URL / NEXUS_TOKEN
const NexusMonitorApp = () => {
const [client, setClient] = useState(null);
const [connected, setConnected] = useState(false);
const [modules, setModules] = useState([]);
const [metrics, setMetrics] = useState({});
const [refreshing, setRefreshing] = useState(false);
useEffect(() => {
initializeClient();
return () => {
if (client) {
client.disconnect();
}
};
}, []);
const initializeClient = async () => {
const nexusClient = new NexusClient({
url: Config.NEXUS_URL,
auth: { token: Config.NEXUS_TOKEN }
});
nexusClient.on('connected', () => setConnected(true));
nexusClient.on('disconnected', () => setConnected(false));
await nexusClient.connect();
setClient(nexusClient);
// Subscribe to real-time data
subscribeToData(nexusClient);
};
const subscribeToData = (nexusClient) => {
// Module states
nexusClient.subscribeModuleStates({
onStateChange: (change) => {
setModules(prev =>
prev.map(m =>
m.id === change.moduleId
? { ...m, ...change }
: m
)
);
}
});
// Telemetry
nexusClient.subscribeTelemetry({
interval: 2000,
onUpdate: (data) => {
setMetrics(prev => ({
...prev,
...data.metrics
}));
}
});
};
const onRefresh = async () => {
setRefreshing(true);
// Fetch latest data
const currentModules = await client.getModules();
setModules(currentModules);
setRefreshing(false);
};
return (
<SafeAreaView style={styles.container}>
<View style={styles.header}>
<Text style={styles.title}>NEXUS-1 Monitor</Text>
<View style={[styles.connectionIndicator,
{ backgroundColor: connected ? '#4CAF50' : '#F44336' }]} />
</View>
<ScrollView
refreshControl={
<RefreshControl
refreshing={refreshing}
onRefresh={onRefresh}
/>
}
>
<View style={styles.section}>
<Text style={styles.sectionTitle}>System Metrics</Text>
{Object.entries(metrics).map(([key, value]) => (
<View key={key} style={styles.metricRow}>
<Text style={styles.metricName}>{key}</Text>
<Text style={styles.metricValue}>{value}</Text>
</View>
))}
</View>
<View style={styles.section}>
<Text style={styles.sectionTitle}>Module Status</Text>
{modules.map(module => (
<View key={module.id} style={styles.moduleCard}>
<Text style={styles.moduleName}>{module.name}</Text>
<Text style={[styles.moduleState,
{ color: module.state === 'running' ? '#4CAF50' : '#FF9800' }]}>
{module.state}
</Text>
</View>
))}
</View>
</ScrollView>
</SafeAreaView>
);
};
const styles = StyleSheet.create({
container: {
flex: 1,
backgroundColor: '#f5f5f5',
},
header: {
flexDirection: 'row',
justifyContent: 'space-between',
alignItems: 'center',
padding: 16,
backgroundColor: '#fff',
elevation: 2,
},
title: {
fontSize: 20,
fontWeight: 'bold',
},
connectionIndicator: {
width: 12,
height: 12,
borderRadius: 6,
},
section: {
margin: 16,
padding: 16,
backgroundColor: '#fff',
borderRadius: 8,
elevation: 1,
},
sectionTitle: {
fontSize: 18,
fontWeight: '600',
marginBottom: 12,
},
metricRow: {
flexDirection: 'row',
justifyContent: 'space-between',
paddingVertical: 8,
borderBottomWidth: 1,
borderBottomColor: '#e0e0e0',
},
metricName: {
fontSize: 14,
color: '#666',
},
metricValue: {
fontSize: 16,
fontWeight: '500',
},
moduleCard: {
flexDirection: 'row',
justifyContent: 'space-between',
alignItems: 'center',
paddingVertical: 12,
borderBottomWidth: 1,
borderBottomColor: '#e0e0e0',
},
moduleName: {
fontSize: 16,
},
moduleState: {
fontSize: 14,
fontWeight: '600',
},
});
Integration with Monitoring Tools
Grafana Integration
NEXUS-1 provides a Grafana data source plugin for seamless integration.
# Install NEXUS-1 Grafana plugin
grafana-cli plugins install nexus1-datasource
# Configure in Grafana UI
# 1. Go to Configuration > Data Sources
# 2. Add data source > NEXUS-1
# 3. Configure:
# - URL: https://nexus.example.com:8443
# - API Token: YOUR_TOKEN
# - Default Module: (optional)
# Example Grafana query
{
"target": "temperature_celsius",
"module": "temperature-monitor",
"aggregation": "avg",
"interval": "5m"
}
Prometheus Integration
Export metrics in Prometheus format for scraping.
# Prometheus configuration
scrape_configs:
- job_name: 'nexus1'
static_configs:
- targets: ['nexus.example.com:9090']
bearer_token: 'YOUR_API_TOKEN'
metrics_path: '/api/v1/metrics/prometheus'
scrape_interval: 30s
Security Considerations
Important Security Notes
- Always use TLS/SSL for client connections
- Implement proper authentication and authorization
- Use API tokens with minimal required permissions
- Enable rate limiting to prevent abuse
- Audit all client connections and data access
Module Telemetry
Overview
Module telemetry enables monitoring of module performance, health, and operational metrics. Implement telemetry collection in your modules to track these metrics; the data can then be consumed by external monitoring applications using the SDK.
Telemetry Overview
Why Telemetry Matters
Telemetry enables real-time visibility into module performance, system health, and operational patterns. It's essential for:
- Performance monitoring and optimization
- Predictive maintenance and anomaly detection
- Capacity planning and resource utilization
- Troubleshooting and root cause analysis
- SLA compliance and reporting
SDK Telemetry APIs
Metric Types
Counter
Monotonically increasing values (e.g., total requests, errors)
Gauge
Point-in-time values that can increase or decrease (e.g., temperature, queue size)
Histogram
Distribution of values over time (e.g., response times, payload sizes)
Summary
Statistical aggregations with percentiles (e.g., p50, p95, p99)
Implementation Examples
using Nexus.SDK.Telemetry;
public class TemperatureModule : ModuleBase
{
private readonly IMetrics _metrics;
private readonly ICounter _readingsCounter;
private readonly IGauge _temperatureGauge;
private readonly IHistogram _processingTime;
private readonly ISummary _batchSizes;
public TemperatureModule()
{
// Initialize metrics from module context
_metrics = Context.Metrics;
// Define metrics with labels for dimensional data
_readingsCounter = _metrics.CreateCounter(
"temperature_readings_total",
"Total number of temperature readings processed",
new[] { "sensor_id", "location" }
);
_temperatureGauge = _metrics.CreateGauge(
"temperature_celsius",
"Current temperature in Celsius",
new[] { "sensor_id", "location" }
);
_processingTime = _metrics.CreateHistogram(
"temperature_processing_duration_ms",
"Time to process temperature reading in milliseconds",
new[] { "sensor_id" },
new[] { 0.5, 1, 2, 5, 10, 20, 50, 100 } // Bucket boundaries
);
_batchSizes = _metrics.CreateSummary(
"temperature_batch_size",
"Size of temperature reading batches",
new[] { "source" },
new[] { 0.5, 0.9, 0.95, 0.99 } // Percentiles
);
}
protected override async Task OnInitializeAsync()
{
// Subscribe to temperature readings
await Messages.SubscribeAsync("sensors.temperature.*", ProcessTemperature);
// Report module health metrics
_metrics.CreateGauge("module_health_score", "Module health score 0-100")
.Set(100);
// Track initialization
_metrics.CreateCounter("module_initializations_total")
.WithLabels("module", ModuleInfo.Name)
.Increment();
}
private async Task ProcessTemperature(Message message)
{
using var timer = _processingTime
.WithLabels(message.Properties["sensor_id"])
.StartTimer();
try
{
var reading = message.GetPayload<TemperatureReading>();
// Update metrics
_readingsCounter
.WithLabels(reading.SensorId, reading.Location)
.Increment();
_temperatureGauge
.WithLabels(reading.SensorId, reading.Location)
.Set(reading.Temperature);
// Process the reading
await ProcessReading(reading);
// Track successful processing
_metrics.CreateCounter("temperature_processing_success_total")
.WithLabels("sensor_id", reading.SensorId)
.Increment();
}
catch (Exception ex)
{
// Track failures
_metrics.CreateCounter("temperature_processing_errors_total")
.WithLabels("sensor_id", message.Properties["sensor_id"], "error", ex.GetType().Name)
.Increment();
throw;
}
}
// Custom metrics for business logic
public async Task ProcessBatch(List<TemperatureReading> readings)
{
// Track batch size distribution
_batchSizes
.WithLabels("api")
.Observe(readings.Count);
// Track temperature anomalies
var anomalies = readings.Count(r => r.Temperature > 100 || r.Temperature < -50);
if (anomalies > 0)
{
_metrics.CreateCounter("temperature_anomalies_total")
.WithLabels("type", "out_of_range")
.Increment(anomalies);
}
}
}
from nexus_sdk import Module, Metrics
from nexus_sdk.telemetry import Counter, Gauge, Histogram, Summary
import time
class TemperatureModule(Module):
def __init__(self):
super().__init__()
# Initialize metrics
self.metrics = self.context.metrics
# Define metrics with labels
self.readings_counter = self.metrics.counter(
'temperature_readings_total',
'Total number of temperature readings processed',
labels=['sensor_id', 'location']
)
self.temperature_gauge = self.metrics.gauge(
'temperature_celsius',
'Current temperature in Celsius',
labels=['sensor_id', 'location']
)
self.processing_time = self.metrics.histogram(
'temperature_processing_duration_ms',
'Time to process temperature reading in milliseconds',
labels=['sensor_id'],
buckets=[0.5, 1, 2, 5, 10, 20, 50, 100]
)
self.batch_sizes = self.metrics.summary(
'temperature_batch_size',
'Size of temperature reading batches',
labels=['source'],
quantiles=[0.5, 0.9, 0.95, 0.99]
)
async def on_initialize(self):
# Subscribe to temperature readings
await self.messages.subscribe('sensors.temperature.*', self.process_temperature)
# Report module health metrics
self.metrics.gauge('module_health_score', 'Module health score 0-100').set(100)
# Track initialization
self.metrics.counter('module_initializations_total').labels(module=self.info.name).increment()
async def process_temperature(self, message):
# Time the processing
with self.processing_time.labels(sensor_id=message.properties['sensor_id']).time():
try:
reading = message.get_payload(TemperatureReading)
# Update metrics
self.readings_counter.labels(sensor_id=reading.sensor_id, location=reading.location).increment()
self.temperature_gauge.labels(sensor_id=reading.sensor_id, location=reading.location).set(reading.temperature)
# Process the reading
await self.process_reading(reading)
# Track successful processing
self.metrics.counter('temperature_processing_success_total').labels(sensor_id=reading.sensor_id).increment()
except Exception as e:
# Track failures
self.metrics.counter('temperature_processing_errors_total').labels(
sensor_id=message.properties['sensor_id'],
error=type(e).__name__
).increment()
raise
# Custom metrics for business logic
async def process_batch(self, readings):
# Track batch size distribution
self.batch_sizes.labels(source='api').observe(len(readings))
# Track temperature anomalies
anomalies = sum(1 for r in readings if r.temperature > 100 or r.temperature < -50)
if anomalies > 0:
self.metrics.counter('temperature_anomalies_total').labels(type='out_of_range').increment(anomalies)
# Export custom metrics
def export_metrics(self):
# Calculate derived metrics
success_rate = self.calculate_success_rate()
self.metrics.gauge('temperature_processing_success_rate').set(success_rate)
# Export to monitoring system
return self.metrics.export(format='prometheus')
#include <nexus/telemetry.hpp>
#include <chrono>
class TemperatureModule : public nexus::ModuleBase {
private:
std::shared_ptr<nexus::IMetrics> metrics_;
std::shared_ptr<nexus::Counter> readings_counter_;
std::shared_ptr<nexus::Gauge> temperature_gauge_;
std::shared_ptr<nexus::Histogram> processing_time_;
std::shared_ptr<nexus::Summary> batch_sizes_;
public:
TemperatureModule() : ModuleBase("TemperatureModule", "1.0.0") {
// Initialize metrics from context
metrics_ = context()->metrics();
// Define metrics with labels
readings_counter_ = metrics_->create_counter(
"temperature_readings_total",
"Total number of temperature readings processed",
{"sensor_id", "location"}
);
temperature_gauge_ = metrics_->create_gauge(
"temperature_celsius",
"Current temperature in Celsius",
{"sensor_id", "location"}
);
processing_time_ = metrics_->create_histogram(
"temperature_processing_duration_ms",
"Time to process temperature reading in milliseconds",
{"sensor_id"},
{0.5, 1, 2, 5, 10, 20, 50, 100} // Bucket boundaries
);
batch_sizes_ = metrics_->create_summary(
"temperature_batch_size",
"Size of temperature reading batches",
{"source"},
{0.5, 0.9, 0.95, 0.99} // Quantiles
);
}
protected:
async_task<void> on_initialize() override {
// Subscribe to temperature readings
co_await messages()->subscribe(
"sensors.temperature.*",
[this](auto msg) { return process_temperature(msg); }
);
// Report module health metrics
metrics_->create_gauge("module_health_score", "Module health score 0-100")
->set(100);
// Track initialization
metrics_->create_counter("module_initializations_total")
->with_labels({"module", info().name})
->increment();
}
async_task<void> process_temperature(const nexus::Message& message) {
// Time the processing
auto timer = processing_time_
->with_labels({message.properties().at("sensor_id")})
->start_timer();
try {
auto reading = message.get_payload<TemperatureReading>();
// Update metrics
readings_counter_
->with_labels({reading.sensor_id, reading.location})
->increment();
temperature_gauge_
->with_labels({reading.sensor_id, reading.location})
->set(reading.temperature);
// Process the reading
co_await process_reading(reading);
// Track successful processing
metrics_->create_counter("temperature_processing_success_total")
->with_labels({"sensor_id", reading.sensor_id})
->increment();
} catch (const std::exception& e) {
// Track failures
metrics_->create_counter("temperature_processing_errors_total")
->with_labels({
{"sensor_id", message.properties().at("sensor_id")},
{"error", typeid(e).name()}
})
->increment();
throw;
}
}
// Custom metrics for business logic
async_task<void> process_batch(const std::vector<TemperatureReading>& readings) {
// Track batch size distribution
batch_sizes_->with_labels({"api"})->observe(readings.size());
// Track temperature anomalies
auto anomalies = std::count_if(readings.begin(), readings.end(),
[](const auto& r) { return r.temperature > 100 || r.temperature < -50; });
if (anomalies > 0) {
metrics_->create_counter("temperature_anomalies_total")
->with_labels({"type", "out_of_range"})
->increment(anomalies);
}
co_return;
}
// Export metrics in Prometheus format
std::string export_metrics() const {
return metrics_->export_prometheus();
}
};
classdef TemperatureModule < nexus.Module
properties (Access = private)
metrics
readingsCounter
temperatureGauge
processingTime
batchSizes
end
methods
function obj = TemperatureModule()
obj@nexus.Module('TemperatureModule', '1.0.0');
% Initialize metrics
obj.metrics = obj.context.metrics;
% Define metrics with labels
obj.readingsCounter = obj.metrics.createCounter(...
'temperature_readings_total', ...
'Total number of temperature readings processed', ...
{'sensor_id', 'location'});
obj.temperatureGauge = obj.metrics.createGauge(...
'temperature_celsius', ...
'Current temperature in Celsius', ...
{'sensor_id', 'location'});
obj.processingTime = obj.metrics.createHistogram(...
'temperature_processing_duration_ms', ...
'Time to process temperature reading in milliseconds', ...
{'sensor_id'}, ...
[0.5, 1, 2, 5, 10, 20, 50, 100]); % Bucket boundaries
obj.batchSizes = obj.metrics.createSummary(...
'temperature_batch_size', ...
'Size of temperature reading batches', ...
{'source'}, ...
[0.5, 0.9, 0.95, 0.99]); % Quantiles
end
end
methods (Access = protected)
function onInitialize(obj)
% Subscribe to temperature readings
obj.messages.subscribe('sensors.temperature.*', @obj.processTemperature);
% Report module health metrics
obj.metrics.createGauge('module_health_score', ...
'Module health score 0-100').set(100);
% Track initialization
obj.metrics.createCounter('module_initializations_total') ...
.withLabels('module', obj.info.name) ...
.increment();
end
function processTemperature(obj, message)
% Time the processing
timer = obj.processingTime ...
.withLabels(message.properties.sensor_id) ...
.startTimer();
try
reading = message.getPayload('TemperatureReading');
% Update metrics
obj.readingsCounter ...
.withLabels(reading.sensorId, reading.location) ...
.increment();
obj.temperatureGauge ...
.withLabels(reading.sensorId, reading.location) ...
.set(reading.temperature);
% Process the reading
obj.processReading(reading);
% Track successful processing
obj.metrics.createCounter('temperature_processing_success_total') ...
.withLabels('sensor_id', reading.sensorId) ...
.increment();
% Stop timer
timer.observeDuration();
catch ME
% Track failures
obj.metrics.createCounter('temperature_processing_errors_total') ...
.withLabels('sensor_id', message.properties.sensor_id, ...
'error', ME.identifier) ...
.increment();
% Stop timer
timer.observeDuration();
rethrow(ME);
end
end
% Custom metrics for business logic
function processBatch(obj, readings)
% Track batch size distribution
obj.batchSizes.withLabels('api').observe(length(readings));
% Track temperature anomalies
anomalies = sum([readings.temperature] > 100 | ...
[readings.temperature] < -50);
if anomalies > 0
obj.metrics.createCounter('temperature_anomalies_total') ...
.withLabels('type', 'out_of_range') ...
.increment(anomalies);
end
end
% Export metrics
function metrics = exportMetrics(obj)
% Calculate derived metrics
successRate = obj.calculateSuccessRate();
obj.metrics.createGauge('temperature_processing_success_rate') ...
.set(successRate);
% Export in Prometheus format
metrics = obj.metrics.export('prometheus');
end
end
end
// LabVIEW Telemetry Implementation
// TemperatureModule.lvclass
// Initialize Metrics.vi
// Creates and configures all metrics for the module
1. Get Metrics Reference from Context
- Wire Module Context to property node
- Get Metrics reference
2. Create Counter - Temperature Readings
- Metric Name: "temperature_readings_total"
- Description: "Total number of temperature readings processed"
- Labels: ["sensor_id", "location"]
- Store reference in class private data
3. Create Gauge - Current Temperature
- Metric Name: "temperature_celsius"
- Description: "Current temperature in Celsius"
- Labels: ["sensor_id", "location"]
- Store reference in class private data
4. Create Histogram - Processing Time
- Metric Name: "temperature_processing_duration_ms"
- Description: "Time to process temperature reading in milliseconds"
- Labels: ["sensor_id"]
- Buckets: [0.5, 1, 2, 5, 10, 20, 50, 100]
- Store reference in class private data
5. Create Summary - Batch Sizes
- Metric Name: "temperature_batch_size"
- Description: "Size of temperature reading batches"
- Labels: ["source"]
- Quantiles: [0.5, 0.9, 0.95, 0.99]
- Store reference in class private data
// Process Temperature.vi
// Handles individual temperature readings with metrics
1. Start Timer
- Get Processing Time Histogram reference
- Call With Labels.vi
* Label Values: [Message.Properties.sensor_id]
- Call Start Timer.vi
- Wire timer reference through structure
2. Try-Catch Structure
Try Case:
a. Parse Message
- Get Payload as TemperatureReading cluster
b. Update Counter
- Get Readings Counter reference
- Call With Labels.vi
* Label Values: [reading.sensorId, reading.location]
- Call Increment.vi
c. Update Gauge
- Get Temperature Gauge reference
- Call With Labels.vi
* Label Values: [reading.sensorId, reading.location]
- Call Set.vi
* Value: reading.temperature
d. Process Reading
- Call module's Process Reading.vi
e. Track Success
- Create/Get Counter "temperature_processing_success_total"
- With Labels: ["sensor_id", reading.sensorId]
- Increment
f. Stop Timer
- Call Observe Duration.vi on timer reference
Catch Case:
a. Track Error
- Create/Get Counter "temperature_processing_errors_total"
- With Labels: ["sensor_id", message.sensor_id, "error", error_source]
- Increment
b. Stop Timer
- Call Observe Duration.vi on timer reference
c. Propagate Error
// Process Batch.vi
// Custom metrics for batch processing
1. Track Batch Size
- Get Batch Sizes Summary reference
- Call With Labels.vi
* Label Values: ["api"]
- Call Observe.vi
* Value: Array Size of readings
2. Count Anomalies
- For Loop through readings array
- Check if temperature > 100 OR temperature < -50
- Count anomalies
3. If anomalies > 0
- Create/Get Counter "temperature_anomalies_total"
- With Labels: ["type", "out_of_range"]
- Increment by anomaly count
// Export Metrics.vi
// Exports metrics in various formats
1. Calculate Derived Metrics
- Calculate success rate from counters
- Create/Update Gauge "temperature_processing_success_rate"
- Set value
2. Export Format Case Structure
Case "prometheus":
- Call Metrics.Export Prometheus.vi
- Return formatted string
Case "json":
- Call Metrics.Export JSON.vi
- Return JSON string
Case "influx":
- Call Metrics.Export InfluxDB.vi
- Return line protocol string
// Module Health Reporting.vi
// Periodic health metrics update
1. Calculate Health Score (0-100)
- Check error rates
- Check processing times
- Check queue sizes
2. Update Health Gauge
- Get/Create Gauge "module_health_score"
- Set calculated value
3. Update Status Metrics
- Memory usage gauge
- CPU usage gauge
- Queue depth gauge
Telemetry Best Practices
Metric Design Guidelines
- Use Standard Naming: Follow Prometheus naming conventions (snake_case, unit suffixes)
- Add Meaningful Labels: Enable filtering and aggregation, but avoid high cardinality
- Choose Appropriate Types: Counter for totals, Gauge for current values, Histogram for distributions
- Include Units: Add unit suffixes to metric names (_seconds, _bytes, _celsius)
- Document Metrics: Provide clear descriptions for each metric
- Avoid Metrics Explosion: Limit label combinations to prevent cardinality issues
- Pre-aggregate When Possible: Use histograms and summaries for efficient percentile calculations
- Export Regularly: Configure appropriate scrape intervals (typically 15-60 seconds)
- Monitor the Monitors: Track metrics collection performance and errors
- Use Consistent Labels: Standardize label names across all metrics
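As a concrete illustration of these guidelines, here is a small Python sketch using the prometheus-client library (the same package listed in the Python requirements later in this guide). The metric and label names mirror the temperature examples above; treat it as a naming template, not NEXUS-specific API:
from prometheus_client import Counter, Gauge, Histogram

# Counter for totals: snake_case name with a _total suffix
READINGS = Counter(
    'temperature_readings_total',
    'Total number of temperature readings processed',
    ['sensor_id', 'location'],  # low-cardinality labels only
)

# Gauge for current values: unit suffix (_celsius) in the name
TEMPERATURE = Gauge(
    'temperature_celsius',
    'Current temperature in Celsius',
    ['sensor_id', 'location'],
)

# Histogram for distributions: explicit buckets, _seconds unit
PROCESSING_TIME = Histogram(
    'temperature_processing_duration_seconds',
    'Time to process one temperature reading',
    ['sensor_id'],
    buckets=(0.0005, 0.001, 0.002, 0.005, 0.01, 0.05, 0.1),
)

def record_reading(sensor_id: str, location: str, temperature: float) -> None:
    """Update all three metrics for one reading."""
    READINGS.labels(sensor_id=sensor_id, location=location).inc()
    TEMPERATURE.labels(sensor_id=sensor_id, location=location).set(temperature)
Note how every label value comes from a small, bounded set (sensor IDs and locations), keeping cardinality under control.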
Integration with Monitoring Systems
Supported Export Formats
- Prometheus: Text-based exposition format for Prometheus scraping
- OpenTelemetry: OTLP protocol for cloud-native observability
- StatsD: Simple UDP protocol for metrics aggregation
- InfluxDB: Line protocol for time-series databases
- CloudWatch: AWS CloudWatch metrics API
- Azure Monitor: Azure Application Insights format
Configuration Example
telemetry:
enabled: true
export:
format: prometheus
endpoint: /metrics
port: 9090
interval: 30s
# Additional exporters
exporters:
- type: otlp
endpoint: http://otel-collector:4317
headers:
api-key: ${OTEL_API_KEY}
- type: cloudwatch
region: us-east-1
namespace: Nexus/Modules
# Metric filtering
filters:
include:
- temperature_*
- module_health_*
exclude:
    - "*_debug_*"
Performance Considerations
Telemetry Performance Tips
- Use Sampling: For high-frequency metrics, consider sampling strategies
- Batch Exports: Aggregate metrics before sending to reduce network overhead
- Async Collection: Don't block business logic for metrics collection
- Local Aggregation: Pre-aggregate on the module side when possible
- Bounded Cardinality: Limit unique label combinations to prevent memory issues
- Efficient Serialization: Use binary protocols for high-volume metrics
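The sampling and local-aggregation tips can be implemented with a few lines of wrapper code. The sketch below assumes a prometheus-client Histogram and Counter underneath; it is illustrative only, not a pattern mandated by the NEXUS SDK:
import random
import threading

class SampledHistogram:
    """Record only a fraction of observations on hot paths (sampling tip)."""

    def __init__(self, histogram, sample_rate=0.1):
        self.histogram = histogram      # e.g. a prometheus_client Histogram
        self.sample_rate = sample_rate

    def observe(self, value):
        # The histogram still converges on the true distribution at volume,
        # while per-observation overhead drops by roughly 1/sample_rate.
        if random.random() < self.sample_rate:
            self.histogram.observe(value)

class BatchedCounter:
    """Aggregate increments locally and flush on an interval (batch-export tip)."""

    def __init__(self, counter, flush_interval=15.0):
        self.counter = counter          # e.g. a prometheus_client Counter
        self.flush_interval = flush_interval
        self.pending = 0
        self.lock = threading.Lock()
        self._schedule()

    def inc(self, amount=1):
        with self.lock:
            self.pending += amount      # cheap in-process add, no export cost

    def _schedule(self):
        t = threading.Timer(self.flush_interval, self._flush)
        t.daemon = True
        t.start()

    def _flush(self):
        with self.lock:
            if self.pending:
                self.counter.inc(self.pending)
                self.pending = 0
        self._schedule()                # re-arm for the next interval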
Alerting and Dashboards
Example Alert Rules
# Prometheus Alert Rules
groups:
- name: temperature_module_alerts
rules:
- alert: HighErrorRate
expr: |
rate(temperature_processing_errors_total[5m]) > 0.1
for: 5m
labels:
severity: warning
annotations:
summary: "High error rate in temperature processing"
- alert: TemperatureAnomaly
expr: |
temperature_celsius > 100 or temperature_celsius < -50
for: 1m
labels:
severity: critical
annotations:
summary: "Temperature reading outside valid range"
- alert: ModuleUnhealthy
expr: |
module_health_score < 50
for: 10m
labels:
severity: warning
annotations:
summary: "Module health degraded"
Dashboard Best Practices
- Group related metrics together
- Use appropriate visualization types (gauges, graphs, heatmaps)
- Include both current values and historical trends
- Add threshold indicators for quick status assessment
- Create drill-down dashboards for detailed analysis
- Use annotations to mark deployments and incidents
Cloud-Native Module Development
Deploy NEXUS-1 modules as containers in cloud-native environments for enhanced scalability, portability, and resource management. This section covers containerization strategies for all supported languages.
Benefits of Container Deployment
Why Containerize NEXUS-1 Modules?
- Isolation: Each module runs in its own container with isolated resources and dependencies
- Portability: Deploy modules consistently across development, staging, and production environments
- Scalability: Leverage Kubernetes for automatic scaling based on load and resource usage
- Resource Management: Fine-grained control over CPU, memory, and GPU allocation
- Version Control: Container images provide immutable module versions with rollback capability
- DevOps Integration: Seamless CI/CD pipeline integration with automated testing and deployment
Container Architecture
Container Configuration
Module Manifest Configuration
Configure container deployment in your module manifest:
modules:
- moduleId: "my-module"
name: "My Container Module"
deployment:
type: container
container:
image: "myregistry/my-module:1.0"
imagePullPolicy: IfNotPresent
resources:
limits:
cpu: "2"
memory: "2Gi"
nvidia.com/gpu: "1" # Optional GPU allocation
requests:
cpu: "500m"
memory: "512Mi"
environment:
- name: LOG_LEVEL
value: "info"
- name: NEXUS_MODULE_ID
valueFrom:
fieldRef:
fieldPath: metadata.name
volumeMounts:
- name: config
mountPath: /config
readOnly: true
- name: data
mountPath: /data
securityContext:
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
livenessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 30
periodSeconds: 30
readinessProbe:
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
Language-Specific Container Examples
C# Module Containerization
1. Create Multi-Stage Dockerfile:
# Build stage
FROM mcr.microsoft.com/dotnet/sdk:8.0-alpine AS build
WORKDIR /src
# Copy project files
COPY ["MyModule.csproj", "./"]
RUN dotnet restore "MyModule.csproj"
# Copy source code and build
COPY . .
RUN dotnet build "MyModule.csproj" -c Release -o /app/build
RUN dotnet publish "MyModule.csproj" -c Release -o /app/publish /p:UseAppHost=false
# Runtime stage
FROM mcr.microsoft.com/dotnet/runtime:8.0-alpine AS runtime
WORKDIR /app
# Install additional dependencies
RUN apk add --no-cache icu-libs tzdata
# Create non-root user
RUN adduser -D -u 1000 nexus
USER nexus
# Copy published application
COPY --from=build /app/publish .
# Add NEXUS module metadata
LABEL nexus.module.id="my-csharp-module" \
nexus.module.type="csharp" \
nexus.module.version="1.0.0"
# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
CMD dotnet MyModule.dll --health-check || exit 1
ENTRYPOINT ["dotnet", "MyModule.dll"]
2. Module Implementation with Container Support:
using Nexus.SDK.Client;
using Microsoft.AspNetCore.Builder;
using Microsoft.AspNetCore.Diagnostics.HealthChecks;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
public class Program
{
public static async Task Main(string[] args)
{
// Handle health check argument
if (args.Contains("--health-check"))
{
var healthCheck = await CheckHealthAsync();
Environment.Exit(healthCheck ? 0 : 1);
}
var host = Host.CreateDefaultBuilder(args)
.ConfigureAppConfiguration((context, config) =>
{
// Load configuration from environment and mounted volumes
config.AddEnvironmentVariables("NEXUS_");
config.AddJsonFile("/config/appsettings.json", optional: true);
config.AddJsonFile($"/config/appsettings.{context.HostingEnvironment.EnvironmentName}.json", optional: true);
})
.ConfigureServices((context, services) =>
{
// Register NEXUS SDK
services.AddNexusModule(context.Configuration);
// Add health checks
services.AddHealthChecks()
.AddCheck("nexus");
// Add HTTP endpoints for probes
services.AddControllers();
})
.ConfigureWebHostDefaults(webBuilder =>
{
webBuilder.UseUrls("http://*:8080");
webBuilder.Configure(app =>
{
app.UseRouting();
app.UseEndpoints(endpoints =>
{
endpoints.MapHealthChecks("/health");
endpoints.MapHealthChecks("/ready", new HealthCheckOptions
{
Predicate = check => check.Tags.Contains("ready")
});
});
});
})
.Build();
await host.RunAsync();
}
    private static async Task<bool> CheckHealthAsync()
{
try
{
using var client = new HttpClient();
var response = await client.GetAsync("http://localhost:8080/health");
return response.IsSuccessStatusCode;
}
catch
{
return false;
}
}
}
public class MyModule : ModuleBase
{
private readonly IConfiguration _configuration;
private readonly ILogger _logger;
public MyModule(IModuleContext context, IConfiguration configuration) : base(context)
{
_configuration = configuration;
_logger = context.GetLogger();
}
protected override async Task ExecuteAsync(CancellationToken cancellationToken)
{
_logger.LogInformation("Module started in container on node: {Node}",
Environment.GetEnvironmentVariable("KUBERNETES_NODE_NAME") ?? "local");
while (!cancellationToken.IsCancellationRequested)
{
try
{
// Module logic here
await ProcessDataAsync();
await Task.Delay(TimeSpan.FromSeconds(5), cancellationToken);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error in module execution");
await Task.Delay(TimeSpan.FromSeconds(30), cancellationToken);
}
}
}
}
3. Build and Deploy:
# Build container image
docker build -t myregistry/my-csharp-module:1.0 .
# Push to registry
docker push myregistry/my-csharp-module:1.0
# Deploy to Kubernetes
kubectl apply -f - <<EOF
apiVersion: nexus.io/v1
kind: NexusModule
metadata:
name: my-csharp-module
namespace: nexus-modules
spec:
moduleId: my-csharp-module
type: csharp
deployment:
type: container
replicas: 3
container:
image: myregistry/my-csharp-module:1.0
resources:
limits:
cpu: "1"
memory: "1Gi"
requests:
cpu: "100m"
memory: "128Mi"
autoscaling:
enabled: true
minReplicas: 3
maxReplicas: 10
targetCPUUtilizationPercentage: 70
EOF
Python Module Containerization
1. Create Optimized Dockerfile:
# Build stage
FROM python:3.11-slim AS builder
WORKDIR /build
# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
g++ \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --user --no-cache-dir -r requirements.txt
# Runtime stage
FROM python:3.11-slim AS runtime
WORKDIR /app
# Install runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
libgomp1 \
&& rm -rf /var/lib/apt/lists/*
# Create non-root user first so copied files can be owned by it
RUN useradd -m -u 1000 nexus
# Copy Python packages from builder into the non-root user's home
# (packages left in /root/.local would be unreadable after dropping root)
COPY --from=builder --chown=nexus:nexus /root/.local /home/nexus/.local
# Copy application code
COPY --chown=nexus:nexus . .
USER nexus
# Update PATH so user-installed packages are found
ENV PATH=/home/nexus/.local/bin:$PATH
ENV PYTHONPATH=/app
# Add module metadata
LABEL nexus.module.id="my-python-module" \
nexus.module.type="python" \
nexus.module.version="1.0.0"
# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
CMD python -c "import requests; requests.get('http://localhost:8080/health').raise_for_status()"
# Run module
CMD ["python", "-m", "nexus_module"]
2. Module Implementation (nexus_module/__main__.py):
import os
import asyncio
import signal
import logging
from typing import Optional
from aiohttp import web
from nexus_sdk import Module, ModuleContext
import uvloop
# Use uvloop for better performance
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
class MyPythonModule(Module):
def __init__(self, context: ModuleContext):
super().__init__(context)
self.logger = logging.getLogger(__name__)
self.app = web.Application()
self.runner: Optional[web.AppRunner] = None
# Setup routes
self.app.router.add_get('/health', self.health_check)
self.app.router.add_get('/ready', self.ready_check)
# Load configuration
self.config = {
'batch_size': int(os.getenv('NEXUS_BATCH_SIZE', '100')),
'processing_interval': int(os.getenv('NEXUS_PROCESSING_INTERVAL', '5')),
'node_name': os.getenv('KUBERNETES_NODE_NAME', 'local')
}
async def initialize(self):
"""Initialize module and start web server"""
# Start health check server
self.runner = web.AppRunner(self.app)
await self.runner.setup()
site = web.TCPSite(self.runner, '0.0.0.0', 8080)
await site.start()
self.logger.info(f"Module initialized on node: {self.config['node_name']}")
# Subscribe to events
await self.message_bus.subscribe('data.input', self.process_data)
async def process_data(self, message):
"""Process incoming data"""
try:
data = message['data']
result = await self.perform_computation(data)
# Publish result
await self.message_bus.publish('data.output', {
'result': result,
'processed_by': self.config['node_name'],
'timestamp': message['timestamp']
})
except Exception as e:
self.logger.error(f"Processing error: {e}")
async def perform_computation(self, data):
"""Perform actual computation"""
# Simulate processing
await asyncio.sleep(0.1)
return {'processed': len(data), 'status': 'success'}
async def health_check(self, request):
"""Health check endpoint"""
return web.json_response({
'status': 'healthy',
'module': 'my-python-module',
'node': self.config['node_name']
})
async def ready_check(self, request):
"""Readiness check endpoint"""
# Check if connected to message bus
if self.message_bus.is_connected():
return web.json_response({'status': 'ready'})
else:
return web.json_response({'status': 'not ready'}, status=503)
async def shutdown(self):
"""Cleanup on shutdown"""
if self.runner:
await self.runner.cleanup()
async def main():
"""Main entry point"""
# Setup logging
logging.basicConfig(
level=os.getenv('LOG_LEVEL', 'INFO'),
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Create and run module
from nexus_sdk import run_module
    # Handle graceful shutdown on SIGTERM/SIGINT
    loop = asyncio.get_running_loop()  # preferred over get_event_loop() inside a coroutine
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, lambda: asyncio.create_task(shutdown(loop)))
await run_module(MyPythonModule)
async def shutdown(loop):
"""Graceful shutdown"""
logging.info("Received shutdown signal")
tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
[task.cancel() for task in tasks]
await asyncio.gather(*tasks, return_exceptions=True)
loop.stop()
if __name__ == '__main__':
asyncio.run(main())
3. requirements.txt:
nexus-sdk>=1.0.0
aiohttp>=3.8.0
uvloop>=0.17.0
requests>=2.28.0
prometheus-client>=0.16.0
numpy>=1.24.0 # If needed for computation
pandas>=2.0.0 # If needed for data processing
4. Deploy with Resource Limits:
apiVersion: nexus.io/v1
kind: NexusModule
metadata:
name: my-python-module
spec:
moduleId: my-python-module
type: python
deployment:
type: container
container:
image: myregistry/my-python-module:1.0
env:
- name: NEXUS_BATCH_SIZE
value: "1000"
- name: LOG_LEVEL
value: "INFO"
resources:
limits:
cpu: "2"
memory: "2Gi"
requests:
cpu: "200m"
memory: "256Mi"
autoscaling:
enabled: true
metrics:
- type: cpu
targetAverageUtilization: 60
- type: memory
targetAverageUtilization: 80
C++ Module Containerization
1. Multi-Stage Build Dockerfile:
# Build stage with development tools
FROM ubuntu:22.04 AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
cmake \
ninja-build \
git \
libprotobuf-dev \
protobuf-compiler \
libgrpc++-dev \
libboost-all-dev \
libtbb-dev \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /build
# Copy source files
COPY CMakeLists.txt .
COPY src/ src/
COPY include/ include/
# Build the module
RUN cmake -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_FLAGS="-march=x86-64 -mtune=generic" \
&& cmake --build build --parallel
# Runtime stage with minimal dependencies
FROM ubuntu:22.04 AS runtime
# Install only runtime dependencies
RUN apt-get update && apt-get install -y \
libprotobuf23 \
libgrpc++1 \
libboost-system1.74.0 \
libboost-thread1.74.0 \
libtbb12 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy built binary and libraries
COPY --from=builder /build/build/bin/nexus_module /app/
COPY --from=builder /build/build/lib/*.so* /usr/local/lib/
# Update library cache
RUN ldconfig
# Create non-root user
RUN useradd -m -u 1000 nexus
USER nexus
# Add module metadata
LABEL nexus.module.id="my-cpp-module" \
nexus.module.type="cpp" \
nexus.module.version="1.0.0"
# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
CMD ["/app/nexus_module", "--health-check"]
CMD ["/app/nexus_module"]
2. CMakeLists.txt:
cmake_minimum_required(VERSION 3.20)
project(nexus-cpp-module CXX)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Find packages
find_package(Threads REQUIRED)
find_package(Protobuf REQUIRED)
find_package(gRPC REQUIRED)
find_package(Boost REQUIRED COMPONENTS system thread fiber)
find_package(TBB REQUIRED)
# Find NEXUS SDK
find_package(NexusSDK REQUIRED)
# Module executable
add_executable(nexus_module
src/main.cpp
src/module_impl.cpp
src/health_server.cpp
)
target_include_directories(nexus_module
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
)
target_link_libraries(nexus_module
PRIVATE
Nexus::SDK
Threads::Threads
protobuf::libprotobuf
gRPC::grpc++
Boost::system
Boost::thread
Boost::fiber
TBB::tbb
)
# Optimization flags
target_compile_options(nexus_module PRIVATE
    $<$<CONFIG:Release>:-O3 -DNDEBUG>
    -Wall -Wextra -Wpedantic
)
# Install
install(TARGETS nexus_module DESTINATION bin)
3. Module Implementation (module_impl.cpp):
// Header names inferred from the code below; <nexus/sdk.h> matches the
// recovery examples later in this guide.
#include <nexus/sdk.h>
#include <boost/asio.hpp>
#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>
#include <atomic>
#include <chrono>
#include <cmath>
#include <memory>
#include <thread>
namespace nexus::modules {
class MyCppModule : public Module {
private:
    std::unique_ptr<MessageBus> message_bus_;
    boost::asio::io_context io_context_;
    std::unique_ptr<std::thread> io_thread_;
    std::atomic<bool> running_{true};
    // Configuration
    struct Config {
        size_t thread_pool_size;
        size_t buffer_size;
        std::string node_name;
    } config_;
public:
MyCppModule(const ModuleContext& context)
: Module(context),
message_bus_(context.GetMessageBus()) {
// Load configuration
        config_.thread_pool_size = context.GetConfig().Get<size_t>(
            "processing.threadPoolSize", std::thread::hardware_concurrency());
        config_.buffer_size = context.GetConfig().Get<size_t>(
            "processing.bufferSize", 1024);
        // std::getenv may return nullptr; avoid the non-standard ?: extension
        const char* node_env = std::getenv("KUBERNETES_NODE_NAME");
        config_.node_name = node_env ? node_env : "local";
        logger().info("C++ Module initialized on node: {}", config_.node_name);
}
void Initialize() override {
// Subscribe to messages
message_bus_->Subscribe("process.request",
[this](const Message& msg) { OnProcessRequest(msg); });
// Start IO thread
        io_thread_ = std::make_unique<std::thread>([this] {
boost::asio::io_context::work work(io_context_);
io_context_.run();
});
// Start health check server
StartHealthCheckServer();
}
void OnProcessRequest(const Message& msg) {
auto start = std::chrono::high_resolution_clock::now();
try {
// Extract data
            auto data = msg.GetField<std::vector<double>>("data");
// Process in parallel using TBB
            std::vector<double> result(data.size());
tbb::parallel_for(
                tbb::blocked_range<size_t>(0, data.size()),
                [&](const tbb::blocked_range<size_t>& range) {
for (size_t i = range.begin(); i != range.end(); ++i) {
// Perform computation
result[i] = std::sin(data[i]) * std::cos(data[i]);
}
}
);
auto end = std::chrono::high_resolution_clock::now();
            auto duration = std::chrono::duration<double, std::milli>(end - start);
// Publish result
Message response;
response.SetField("result", result);
response.SetField("processing_time_ms", duration.count());
response.SetField("processed_by", config_.node_name);
message_bus_->Publish("process.response", response);
logger().debug("Processed {} elements in {:.2f}ms",
data.size(), duration.count());
} catch (const std::exception& e) {
logger().error("Processing error: {}", e.what());
}
}
void StartHealthCheckServer() {
        auto acceptor = std::make_shared<boost::asio::ip::tcp::acceptor>(
io_context_,
boost::asio::ip::tcp::endpoint(boost::asio::ip::tcp::v4(), 8080)
);
AcceptHealthCheckConnection(acceptor);
}
    void AcceptHealthCheckConnection(
        std::shared_ptr<boost::asio::ip::tcp::acceptor> acceptor) {
        auto socket = std::make_shared<boost::asio::ip::tcp::socket>(io_context_);
acceptor->async_accept(*socket,
[this, acceptor, socket](boost::system::error_code ec) {
if (!ec && running_) {
HandleHealthCheck(socket);
AcceptHealthCheckConnection(acceptor);
}
});
}
    void HandleHealthCheck(std::shared_ptr<boost::asio::ip::tcp::socket> socket) {
// Simple HTTP health check response
const std::string response =
"HTTP/1.1 200 OK\r\n"
"Content-Type: application/json\r\n"
"Content-Length: 17\r\n"
"\r\n"
"{\"status\":\"ok\"}";
boost::asio::async_write(*socket, boost::asio::buffer(response),
[socket](boost::system::error_code, std::size_t) {
socket->close();
});
}
void Shutdown() override {
running_ = false;
io_context_.stop();
if (io_thread_ && io_thread_->joinable()) {
io_thread_->join();
}
}
~MyCppModule() {
Shutdown();
}
};
} // namespace nexus::modules
// Module factory
extern "C" nexus::Module* CreateModule(const nexus::ModuleContext& context) {
return new nexus::modules::MyCppModule(context);
}
4. Deploy with Performance Optimization:
apiVersion: nexus.io/v1
kind: NexusModule
metadata:
name: my-cpp-module
spec:
moduleId: my-cpp-module
type: cpp
deployment:
type: container
container:
image: myregistry/my-cpp-module:1.0
resources:
limits:
cpu: "8"
memory: "8Gi"
requests:
cpu: "2"
memory: "2Gi"
env:
- name: OMP_NUM_THREADS
value: "8"
- name: TBB_NUM_THREADS
value: "8"
nodeSelector:
node.kubernetes.io/instance-type: compute-optimized
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- my-cpp-module
topologyKey: kubernetes.io/hostname
MATLAB Module Containerization
1. MATLAB Runtime Dockerfile:
# Use MATLAB Runtime base image
FROM mathworks/matlab-runtime:R2023b AS runtime
# Install additional dependencies
RUN apt-get update && apt-get install -y \
libgomp1 \
libzmq5 \
curl \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy compiled MATLAB application
COPY ./compiled/MyMatlabModule /app/MyMatlabModule
COPY ./startup.sh /app/
COPY ./config /app/config
# Copy MATLAB runtime components
COPY ./matlab-deps /app/matlab-deps
# Set permissions
RUN chmod +x /app/startup.sh \
&& chmod +x /app/MyMatlabModule/run_MyMatlabModule.sh
# Create non-root user
RUN useradd -m -u 1000 nexus && chown -R nexus:nexus /app
USER nexus
# MATLAB Runtime environment
# NOTE: MCR_ROOT must match the runtime installed by the base image
# (v97 corresponds to R2019b; adjust for an R2023b base accordingly)
ENV MCR_ROOT=/opt/mcr/v97
ENV LD_LIBRARY_PATH=$MCR_ROOT/runtime/glnxa64:$MCR_ROOT/bin/glnxa64:$LD_LIBRARY_PATH
ENV XAPPLRESDIR=$MCR_ROOT/X11/app-defaults
# Add module metadata
LABEL nexus.module.id="my-matlab-module" \
nexus.module.type="matlab" \
nexus.module.version="1.0.0"
# Health check
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
CMD curl -f http://localhost:8080/health || exit 1
# Use startup script to handle initialization
CMD ["/app/startup.sh"]
2. MATLAB Module Code (MyMatlabModule.m):
classdef MyMatlabModule < nexus.Module
properties (Access = private)
messageBus
config
httpServer
processingStats
end
methods
function obj = MyMatlabModule(context)
obj@nexus.Module(context);
% Initialize components
obj.messageBus = context.MessageBus;
obj.config = context.Configuration;
% Initialize statistics
obj.processingStats = struct(...
'processed', 0, ...
'errors', 0, ...
'totalTime', 0 ...
);
% Start HTTP server for health checks
obj.startHealthCheckServer();
end
function initialize(obj)
% Log startup
nodeName = getenv('KUBERNETES_NODE_NAME');
if isempty(nodeName)
nodeName = 'local';
end
obj.logger.info('MATLAB Module initialized on node: %s', nodeName);
% Subscribe to computation requests
obj.messageBus.subscribe('compute.matlab', @obj.onComputeRequest);
            % Start periodic metrics reporting (a timer must be started explicitly)
            metricsTimer = timer('ExecutionMode', 'fixedRate', ...
                'Period', 60, ...
                'TimerFcn', @(~,~) obj.reportMetrics(), ...
                'StartDelay', 60);
            start(metricsTimer);
end
function onComputeRequest(obj, message)
startTime = tic;
try
% Extract data
inputData = message.data;
algorithm = message.algorithm;
parameters = message.parameters;
% Perform computation based on algorithm
switch algorithm
case 'fft'
result = obj.computeFFT(inputData, parameters);
case 'filter'
result = obj.applyFilter(inputData, parameters);
case 'optimization'
result = obj.runOptimization(inputData, parameters);
case 'ml_inference'
result = obj.runMLInference(inputData, parameters);
otherwise
error('Unknown algorithm: %s', algorithm);
end
% Update statistics
obj.processingStats.processed = obj.processingStats.processed + 1;
obj.processingStats.totalTime = obj.processingStats.totalTime + toc(startTime);
% Publish result
obj.messageBus.publish('compute.result', struct(...
'requestId', message.requestId, ...
'result', result, ...
'algorithm', algorithm, ...
'processingTime', toc(startTime), ...
'processedBy', getenv('HOSTNAME') ...
));
catch ME
obj.logger.error('Computation error: %s', ME.message);
obj.processingStats.errors = obj.processingStats.errors + 1;
% Publish error response
obj.messageBus.publish('compute.error', struct(...
'requestId', message.requestId, ...
'error', ME.message, ...
'algorithm', algorithm ...
));
end
end
function result = computeFFT(obj, data, params)
% Fast Fourier Transform with parameters
nfft = params.nfft;
window = params.window;
% Apply window if specified
if ~isempty(window)
switch window
case 'hamming'
data = data .* hamming(length(data));
case 'hanning'
data = data .* hanning(length(data));
case 'blackman'
data = data .* blackman(length(data));
end
end
% Compute FFT
Y = fft(data, nfft);
% Return magnitude and phase
result = struct(...
'magnitude', abs(Y), ...
'phase', angle(Y), ...
'frequency', (0:nfft-1) * params.sampleRate / nfft ...
);
end
function result = applyFilter(obj, data, params)
% Digital filtering with various filter types
filterType = params.type;
order = params.order;
switch filterType
case 'lowpass'
[b, a] = butter(order, params.cutoff/(params.sampleRate/2), 'low');
case 'highpass'
[b, a] = butter(order, params.cutoff/(params.sampleRate/2), 'high');
case 'bandpass'
[b, a] = butter(order, ...
[params.lowCutoff params.highCutoff]/(params.sampleRate/2), ...
'bandpass');
case 'notch'
wo = params.notchFreq/(params.sampleRate/2);
bw = wo/params.quality;
[b, a] = iirnotch(wo, bw);
end
% Apply filter
result = filtfilt(b, a, data);
end
function result = runOptimization(obj, data, params)
% Run optimization algorithm
objectiveFunc = params.objective;
constraints = params.constraints;
options = optimoptions(params.solver, ...
'Display', 'off', ...
'MaxIterations', params.maxIterations);
% Define objective function
switch objectiveFunc
case 'quadratic'
fun = @(x) x'*data.H*x + data.f'*x;
case 'rosenbrock'
fun = @(x) sum(100*(x(2:end)-x(1:end-1).^2).^2 + (1-x(1:end-1)).^2);
otherwise
error('Unknown objective function');
end
% Run optimization
x0 = params.initialGuess;
if isempty(constraints)
[xopt, fval, exitflag] = fminunc(fun, x0, options);
else
[xopt, fval, exitflag] = fmincon(fun, x0, ...
constraints.A, constraints.b, ...
constraints.Aeq, constraints.beq, ...
constraints.lb, constraints.ub, ...
[], options);
end
result = struct(...
'solution', xopt, ...
'objectiveValue', fval, ...
'exitFlag', exitflag ...
);
end
function result = runMLInference(obj, data, params)
% Run machine learning inference
modelPath = fullfile('/models', params.modelName);
% Load pre-trained model
if exist(modelPath, 'file')
model = load(modelPath);
% Preprocess data
processedData = obj.preprocessData(data, params.preprocessing);
% Run inference
switch params.modelType
case 'classification'
[label, score] = predict(model.classifier, processedData);
result = struct('label', label, 'confidence', max(score));
case 'regression'
prediction = predict(model.regressor, processedData);
result = struct('prediction', prediction);
case 'deeplearning'
prediction = predict(model.net, processedData);
result = struct('prediction', prediction);
end
else
error('Model not found: %s', params.modelName);
end
end
function startHealthCheckServer(obj)
% Simple HTTP server for health checks
obj.httpServer = matlab.net.http.HTTPServer;
obj.httpServer.Port = 8080;
obj.httpServer.RequestHandler = @obj.handleHealthCheck;
obj.httpServer.start();
end
function response = handleHealthCheck(obj, request)
% Handle health check requests
if strcmp(request.RequestLine.Method, 'GET') && ...
strcmp(request.RequestLine.URI.Path, '/health')
% Check module health
isHealthy = obj.messageBus.isConnected() && ...
obj.processingStats.errors < 10;
status = struct(...
'status', conditionalString(isHealthy, 'healthy', 'unhealthy'), ...
'module', 'my-matlab-module', ...
'stats', obj.processingStats ...
);
response = matlab.net.http.ResponseMessage;
response.StatusCode = matlab.net.http.StatusCode.OK;
response.Body = matlab.net.http.MessageBody(status);
response.ContentType = 'application/json';
else
response = matlab.net.http.ResponseMessage;
response.StatusCode = matlab.net.http.StatusCode.NotFound;
end
end
function reportMetrics(obj)
% Report module metrics
metrics = struct(...
'processed_total', obj.processingStats.processed, ...
'errors_total', obj.processingStats.errors, ...
'average_processing_time', ...
obj.processingStats.totalTime / max(1, obj.processingStats.processed), ...
'timestamp', datetime('now', 'TimeZone', 'UTC') ...
);
obj.messageBus.publish('metrics.matlab', metrics);
end
end
end
function str = conditionalString(condition, trueStr, falseStr)
if condition
str = trueStr;
else
str = falseStr;
end
end
3. Build Script (build_matlab_module.sh):
#!/bin/bash
# Build MATLAB module for containerization
# Compile MATLAB application
mcc -m MyMatlabModule.m \
-a ./+nexus \
-a ./lib \
-a ./config \
-d ./compiled/MyMatlabModule \
-v
# Create startup script
cat > startup.sh <<'EOF'
#!/bin/bash
# Start MATLAB module with proper environment
export MCR_CACHE_ROOT=/tmp/mcr_cache_${USER}
mkdir -p ${MCR_CACHE_ROOT}
# Run the compiled MATLAB application
exec /app/MyMatlabModule/run_MyMatlabModule.sh $MCR_ROOT
EOF
chmod +x startup.sh
# Build Docker image
docker build -t myregistry/my-matlab-module:1.0 .
# Push to registry
docker push myregistry/my-matlab-module:1.0
4. Deploy with GPU Support:
apiVersion: nexus.io/v1
kind: NexusModule
metadata:
name: my-matlab-module
spec:
moduleId: my-matlab-module
type: matlab
deployment:
type: container
container:
image: myregistry/my-matlab-module:1.0
resources:
limits:
cpu: "8"
memory: "16Gi"
nvidia.com/gpu: "1" # GPU for computation
requests:
cpu: "2"
memory: "4Gi"
volumeMounts:
- name: models
mountPath: /models
readOnly: true
- name: mcr-cache
mountPath: /tmp/mcr_cache
env:
- name: CUDA_VISIBLE_DEVICES
value: "0"
volumes:
- name: models
persistentVolumeClaim:
claimName: matlab-models-pvc
- name: mcr-cache
emptyDir:
sizeLimit: 10Gi
nodeSelector:
accelerator: nvidia-tesla-v100
LabVIEW Module Containerization
1. LabVIEW Runtime Dockerfile:
# Use NI LabVIEW Runtime base image
FROM ni/labview-runtime:2023-sp1 AS runtime
# Install additional dependencies
RUN apt-get update && apt-get install -y \
libzmq5 \
libprotobuf23 \
libusb-1.0-0 \
curl \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy LabVIEW built application
COPY ./builds/NexusModule /app/NexusModule
COPY ./config /app/config
COPY ./startup.sh /app/
# Copy NI driver configurations
COPY ./ni-drivers /etc/ni-drivers
# Set permissions
RUN chmod +x /app/startup.sh \
&& chmod +x /app/NexusModule/NexusModule
# Create non-root user (required for some NI drivers)
RUN useradd -m -u 1000 nexus && \
usermod -a -G dialout nexus && \
chown -R nexus:nexus /app
# LabVIEW Runtime environment
ENV LVRT_INSTALL_DIR=/usr/local/natinst
ENV LD_LIBRARY_PATH=$LVRT_INSTALL_DIR/lib:$LD_LIBRARY_PATH
# Add module metadata
LABEL nexus.module.id="my-labview-module" \
nexus.module.type="labview" \
nexus.module.version="1.0.0"
# Health check
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD curl -f http://localhost:8080/health || exit 1
# Some operations may require root for hardware access
USER root
CMD ["/app/startup.sh"]
2. Build Configuration (build_labview_module.sh):
#!/bin/bash
# Build LabVIEW module for containerization
# Set LabVIEW environment
export LABVIEW_PATH="/usr/local/natinst/LabVIEW-2023-64"
# Build LabVIEW application
labview-cli build-app \
--project "NexusModule.lvproj" \
--build-spec "Container Build" \
--output "./builds/NexusModule" \
--log-file "./build.log"
# Create startup script
cat > startup.sh <<'EOF'
#!/bin/bash
# Initialize NI drivers if needed
if [ -d "/etc/ni-drivers" ]; then
source /etc/ni-drivers/init.sh
fi
# Set up module configuration
export NEXUS_CONFIG_FILE=${NEXUS_CONFIG_FILE:-/app/config/module.json}
# Handle health check mode
if [ "$1" = "--health-check" ]; then
curl -s http://localhost:8080/health > /dev/null
exit $?
fi
# Start LabVIEW module
exec /app/NexusModule/NexusModule \
--nexus-config "$NEXUS_CONFIG_FILE" \
--message-bus "${NEXUS_MESSAGE_BUS:-grpc://nexus-kernel:5000}"
EOF
chmod +x startup.sh
# Create module configuration (values shown are illustrative; adjust for your module)
cat > config/module.json <<'EOF'
{
  "moduleId": "my-labview-module",
  "type": "labview",
  "messageBus": "grpc://nexus-kernel:5000"
}
EOF
3. LabVIEW Module Structure:
Key LabVIEW VIs for Container Module:
- Main.vi: Entry point that initializes NEXUS connection
- MessageBusHandler.vi: Handles message bus communication
- HealthCheckServer.vi: HTTP server for container probes
- DataAcquisitionLoop.vi: Main DAQ processing loop
- ConfigurationLoader.vi: Loads JSON configuration
4. Deploy with Hardware Access:
apiVersion: nexus.io/v1
kind: NexusModule
metadata:
name: my-labview-module
spec:
moduleId: my-labview-module
type: labview
deployment:
type: container
container:
image: myregistry/my-labview-module:1.0
securityContext:
privileged: true # Required for hardware access
capabilities:
add:
- SYS_RAWIO
- SYS_ADMIN
volumeMounts:
- name: dev-bus-usb
mountPath: /dev/bus/usb
- name: ni-devices
mountPath: /dev/ni
- name: visa-config
mountPath: /etc/visa
resources:
limits:
cpu: "4"
memory: "8Gi"
# Custom resource for NI hardware
ni.com/daq-device: "1"
requests:
cpu: "1"
memory: "2Gi"
env:
- name: VISA_DEV_PATH
value: "/dev/visa"
- name: NI_MAX_CONFIG
value: "/etc/ni/max.ini"
volumes:
- name: dev-bus-usb
hostPath:
path: /dev/bus/usb
type: Directory
- name: ni-devices
hostPath:
path: /dev/ni
type: DirectoryOrCreate
- name: visa-config
configMap:
name: visa-configuration
nodeSelector:
hardware.ni.com/daq: "true"
tolerations:
- key: "hardware.ni.com/exclusive"
operator: "Equal"
value: "true"
effect: "NoSchedule"
5. Hardware Resource Management:
# ConfigMap for VISA configuration
apiVersion: v1
kind: ConfigMap
metadata:
name: visa-configuration
namespace: nexus-modules
data:
visa.ini: |
[Aliases]
DAQ1 = "USB0::0x3923::0x7514::01234567::RAW"
SCOPE1 = "TCPIP0::192.168.1.100::inst0::INSTR"
[Resources]
NumResources = 2
Resource0 = "USB0::0x3923::0x7514::01234567::RAW"
Resource1 = "TCPIP0::192.168.1.100::inst0::INSTR"
---
# Device plugin for NI hardware
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: ni-device-plugin
namespace: kube-system
spec:
selector:
matchLabels:
name: ni-device-plugin
template:
metadata:
labels:
name: ni-device-plugin
spec:
containers:
- name: ni-device-plugin
image: ni/device-plugin:latest
securityContext:
privileged: true
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: dev
mountPath: /dev
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: dev
hostPath:
path: /dev
Container Orchestration with Kubernetes
Kubernetes Integration
NEXUS-1 provides native Kubernetes integration through Custom Resource Definitions (CRDs) and operators:
# Install NEXUS-1 with Helm
helm repo add nexus https://nexus-charts.io
helm install nexus-1 nexus/nexus-1 \
--namespace nexus-system \
--create-namespace \
--values values.yaml
# Deploy a module using CRD
kubectl apply -f - <<EOF
apiVersion: nexus.io/v1
kind: NexusModule
metadata:
name: signal-processor
namespace: nexus-modules
spec:
moduleId: signal-processor
name: "Signal Processing Module"
deployment:
type: container
replicas: 5
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 2
maxUnavailable: 1
container:
image: myregistry/signal-processor:1.0
imagePullSecrets:
- name: registry-credentials
autoscaling:
enabled: true
minReplicas: 5
maxReplicas: 20
metrics:
- type: cpu
targetAverageUtilization: 60
- type: memory
targetAverageUtilization: 70
- type: custom
metric:
name: message_queue_depth
selector:
matchLabels:
queue: signal-processor
targetValue: "100"
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: nexus.io/module-id
operator: In
values:
- signal-processor
topologyKey: kubernetes.io/hostname
EOF
# Check module status
kubectl get nexusmodules -n nexus-modules
kubectl describe nexusmodule signal-processor -n nexus-modules
CI/CD Integration
GitHub Actions Workflow
name: Build and Deploy NEXUS Module
on:
push:
branches: [main, develop]
tags: ['v*']
pull_request:
branches: [main]
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}/nexus-module
jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
language: [csharp, python, cpp]
steps:
- uses: actions/checkout@v3
- name: Set up language environment
uses: ./.github/actions/setup-${{ matrix.language }}
- name: Run tests
run: |
case ${{ matrix.language }} in
csharp) dotnet test --logger trx --results-directory test-results ;;
python) pytest --junitxml=test-results/junit.xml ;;
cpp) cmake --build build --target test ;;
esac
- name: Upload test results
uses: actions/upload-artifact@v3
with:
name: test-results-${{ matrix.language }}
path: test-results/
build:
needs: test
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- uses: actions/checkout@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Log in to Container Registry
uses: docker/login-action@v2
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata
id: meta
uses: docker/metadata-action@v4
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
type=ref,event=branch
type=ref,event=pr
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=sha
- name: Build and push container image
uses: docker/build-push-action@v4
with:
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
build-args: |
BUILD_VERSION=${{ github.sha }}
BUILD_DATE=${{ steps.meta.outputs.created }}
deploy:
needs: build
runs-on: ubuntu-latest
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
steps:
- uses: actions/checkout@v3
- name: Set up Kubernetes
uses: azure/setup-kubectl@v3
with:
version: 'v1.28.0'
- name: Configure Kubernetes
run: |
echo "${{ secrets.KUBE_CONFIG }}" | base64 -d > kubeconfig
          echo "KUBECONFIG=$PWD/kubeconfig" >> "$GITHUB_ENV" # a plain export would not persist to later steps
- name: Deploy to Kubernetes
run: |
# Update image in manifest
sed -i "s|IMAGE_PLACEHOLDER|${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:sha-${GITHUB_SHA::8}|g" k8s/module.yaml
# Apply manifest
kubectl apply -f k8s/module.yaml
# Wait for rollout
kubectl rollout status deployment/nexus-module -n nexus-modules --timeout=5m
- name: Run smoke tests
run: |
kubectl run smoke-test --image=curlimages/curl:latest --rm -it --restart=Never -- \
curl -f http://nexus-module.nexus-modules.svc.cluster.local:8080/health
Best Practices
Container Security Best Practices
- Image Security:
- Scan images for vulnerabilities using tools like Trivy or Snyk
- Use minimal base images (Alpine, distroless)
- Never run as root unless absolutely necessary
- Sign images with cosign or similar tools
- Runtime Security:
- Use read-only root filesystem where possible
- Drop unnecessary Linux capabilities
- Implement network policies
- Use Pod Security Standards
- Configuration Management:
- Store secrets in Kubernetes Secrets or external vaults
- Use ConfigMaps for non-sensitive configuration
- Implement proper RBAC policies
Performance Optimization
- Image Optimization:
- Use multi-stage builds to reduce image size
- Layer caching for faster builds
- Minimize layer count
- Resource Management:
- Set appropriate resource requests and limits
- Use horizontal pod autoscaling
- Implement proper health checks
- Startup Optimization:
- Optimize application startup time
- Use init containers for initialization
- Implement proper readiness probes
Monitoring and Observability
Prometheus Metrics
All containerized modules automatically expose Prometheus metrics:
# ServiceMonitor for Prometheus Operator
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: nexus-modules
namespace: nexus-modules
spec:
selector:
matchLabels:
nexus.io/module: "true"
endpoints:
- port: metrics
interval: 30s
path: /metrics
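If your module does not already embed an HTTP server, a simple way to expose the /metrics endpoint this ServiceMonitor scrapes is prometheus-client's built-in exporter. A minimal Python sketch (the port must match the port named metrics on your module's Service; the metric name is illustrative):
import time
from prometheus_client import start_http_server, Counter

HEARTBEATS = Counter('module_heartbeats_total', 'Heartbeat ticks emitted by the module')

if __name__ == '__main__':
    start_http_server(9090)  # serves /metrics in Prometheus text format
    while True:
        HEARTBEATS.inc()
        time.sleep(15)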
Distributed Tracing
Enable OpenTelemetry for distributed tracing:
deployment:
container:
env:
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://otel-collector.observability:4317"
- name: OTEL_SERVICE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.labels['nexus.io/module-id']
- name: OTEL_TRACES_EXPORTER
value: "otlp"
- name: OTEL_METRICS_EXPORTER
value: "otlp"
Troubleshooting
Common Issues and Solutions
1. Module fails to start:
# Check pod logs
kubectl logs <pod-name> -n nexus-modules --previous
# Describe pod for events
kubectl describe pod <pod-name> -n nexus-modules
# Check resource constraints
kubectl top pod -n nexus-modules
2. Connection to message bus fails:
# Verify network policies
kubectl get networkpolicies -n nexus-modules
# Test connectivity
kubectl exec <pod-name> -n nexus-modules -- nc -zv nexus-kernel.nexus-system 5000
# Check service endpoints
kubectl get endpoints -n nexus-system nexus-kernel
3. Hardware access issues (LabVIEW/Instruments):
# Verify device plugin
kubectl get nodes -o json | jq '.items[].status.allocatable'
# Check security context
kubectl get pod <pod-name> -n nexus-modules -o jsonpath='{.spec.containers[0].securityContext}'
# Verify volume mounts
kubectl exec <pod-name> -n nexus-modules -- ls -la /dev/
Advanced Recovery System
Build resilient modules with self-healing capabilities using the NEXUS-1 SDK's recovery system APIs. Implement circuit breakers, retry strategies, and graceful degradation to ensure continuous operation.
Recovery System Overview
Key Recovery Concepts
- Circuit Breakers: Prevent cascading failures by temporarily disabling failing operations
- Retry Policies: Automatically retry transient failures with exponential backoff (see the delay sketch after this list)
- Health Checks: Monitor module health and trigger recovery actions
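The retry examples in this section all share the same backoff shape. As a quick reference, this small sketch computes the delay sequence for the parameters used throughout (initial delay 1 s, multiplier 2.0, cap 30 s):
from datetime import timedelta

def backoff_delays(max_attempts=3, delay=timedelta(seconds=1),
                   multiplier=2.0, max_delay=timedelta(seconds=30)):
    """Yield the wait before each retry: delay * multiplier**(attempt-1), capped."""
    for attempt in range(1, max_attempts + 1):
        yield min(delay * (multiplier ** (attempt - 1)), max_delay)

print(list(backoff_delays()))  # 1s, 2s, 4s for the defaults above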
SDK Recovery APIs
Core Recovery Components
- ICircuitBreaker: Prevents cascading failures
- IRetryPolicy: Configures retry behavior
- IHealthCheck: Monitors component health
- IRecoveryStrategy: Defines recovery actions
- IBulkhead: Isolates resources to prevent exhaustion
- ISaga: Manages distributed transactions with compensation
Implementation Examples
using Nexus.SDK.Recovery;
using Polly;
public class ResilientModule : ModuleBase
{
private readonly ICircuitBreaker _circuitBreaker;
private readonly IRetryPolicy _retryPolicy;
private readonly IBulkhead _bulkhead;
public ResilientModule()
{
// Configure circuit breaker
_circuitBreaker = Recovery.CreateCircuitBreaker(
name: "external-api",
failureThreshold: 5,
samplingDuration: TimeSpan.FromMinutes(1),
minimumThroughput: 10,
breakDuration: TimeSpan.FromSeconds(30)
);
// Configure retry policy with exponential backoff
_retryPolicy = Recovery.CreateRetryPolicy(
maxAttempts: 3,
delay: TimeSpan.FromSeconds(1),
backoffMultiplier: 2.0,
maxDelay: TimeSpan.FromSeconds(30),
retryOn: new[] {
typeof(TransientException),
typeof(TimeoutException)
}
);
// Configure bulkhead for resource isolation
_bulkhead = Recovery.CreateBulkhead(
name: "database-connections",
maxConcurrency: 10,
maxQueueLength: 50
);
}
// Example: Resilient external API call
    public async Task<ApiResponse> CallExternalApiAsync(ApiRequest request)
{
return await _circuitBreaker.ExecuteAsync(
operation: async () =>
{
return await _retryPolicy.ExecuteAsync(
operation: async () =>
{
using var client = new HttpClient();
client.Timeout = TimeSpan.FromSeconds(5);
var response = await client.PostAsJsonAsync(
"https://api.example.com/data",
request
);
response.EnsureSuccessStatusCode();
                        return await response.Content.ReadFromJsonAsync<ApiResponse>();
},
onRetry: (attempt, delay, exception) =>
{
                        Logger.LogWarning(
$"Retry attempt {attempt} after {delay}ms due to {exception.Message}"
);
}
);
},
fallback: async () =>
{
// Return cached or default response when circuit is open
Logger.Warning("Circuit breaker open, returning cached response");
return await GetCachedResponseAsync(request) ?? ApiResponse.Empty;
}
);
}
// Example: Bulkhead-protected database operation
    public async Task<QueryResult> QueryDatabaseAsync(Query query)
{
return await _bulkhead.ExecuteAsync(
operation: async () =>
{
using var connection = await Database.OpenConnectionAsync();
return await connection.QueryAsync(query);
},
onRejected: () =>
{
Logger.Warning("Bulkhead full, rejecting database query");
throw new BulkheadRejectedException();
}
);
}
// Health check implementation
    protected override async Task<HealthStatus> OnHealthCheckAsync()
{
        var checks = new List<HealthCheckResult>();
// Check circuit breaker state
checks.Add(new HealthCheckResult
{
Component = "ExternalAPI",
Status = _circuitBreaker.State == CircuitState.Closed
? HealthState.Healthy
: HealthState.Degraded,
Details = new
{
State = _circuitBreaker.State.ToString(),
FailureCount = _circuitBreaker.Metrics.FailureCount,
SuccessCount = _circuitBreaker.Metrics.SuccessCount
}
});
// Check bulkhead utilization
checks.Add(new HealthCheckResult
{
Component = "DatabasePool",
Status = _bulkhead.AvailableCount > 0
? HealthState.Healthy
: HealthState.Degraded,
Details = new
{
Available = _bulkhead.AvailableCount,
QueueLength = _bulkhead.QueueLength
}
});
// Aggregate health status
var overallHealth = checks.All(c => c.Status == HealthState.Healthy)
? HealthState.Healthy
: checks.Any(c => c.Status == HealthState.Unhealthy)
? HealthState.Unhealthy
: HealthState.Degraded;
return new HealthStatus
{
State = overallHealth,
Checks = checks,
Message = GenerateHealthMessage(checks)
};
}
// Saga pattern for distributed transactions
    public async Task<OrderResult> ProcessOrderWithSagaAsync(Order order)
{
var saga = Recovery.CreateSaga("process-order")
.AddStep(
name: "reserve-inventory",
action: async () => await Inventory.ReserveItemsAsync(order.Items),
compensation: async (result) => await Inventory.ReleaseItemsAsync(result)
)
.AddStep(
name: "charge-payment",
action: async () => await Payment.ChargeAsync(order.Payment),
compensation: async (result) => await Payment.RefundAsync(result)
)
.AddStep(
name: "ship-order",
action: async () => await Shipping.CreateShipmentAsync(order),
compensation: async (result) => await Shipping.CancelShipmentAsync(result)
)
.WithIsolation(IsolationLevel.Serializable)
.WithTimeout(TimeSpan.FromMinutes(5));
try
{
return await saga.ExecuteAsync();
}
catch (SagaException ex)
{
            Logger.LogError($"Order processing failed at step {ex.FailedStep}: {ex.Message}");
// Compensation already executed
throw;
}
}
}
from nexus_sdk import Module
from nexus_sdk.recovery import (
    CircuitBreaker, RetryPolicy, Bulkhead, BulkheadFullError,
    HealthState, Saga, SagaError, RecoveryError
)
import asyncio
from datetime import datetime, timedelta
import aiohttp
class ResilientModule(Module):
def __init__(self):
super().__init__()
# Configure circuit breaker
self.circuit_breaker = CircuitBreaker(
name="external_api",
failure_threshold=5,
recovery_timeout=timedelta(seconds=30),
expected_exception=aiohttp.ClientError
)
# Configure retry policy
self.retry_policy = RetryPolicy(
max_attempts=3,
delay=timedelta(seconds=1),
backoff_multiplier=2.0,
max_delay=timedelta(seconds=30),
retry_on=[aiohttp.ClientError, asyncio.TimeoutError]
)
# Configure bulkhead
self.bulkhead = Bulkhead(
name="database_connections",
max_concurrency=10,
max_queue_length=50
)
# Example: Resilient external API call
async def call_external_api(self, request):
@self.circuit_breaker
@self.retry_policy
async def _make_request():
async with aiohttp.ClientSession() as session:
async with session.post(
'https://api.example.com/data',
json=request,
timeout=aiohttp.ClientTimeout(total=5)
) as response:
response.raise_for_status()
return await response.json()
try:
return await _make_request()
except RecoveryError as e:
# Circuit is open, return cached response
self.logger.warning(f"Circuit breaker open: {e}")
return await self.get_cached_response(request) or {}
# Example: Bulkhead-protected database operation
async def query_database(self, query):
@self.bulkhead
async def _execute_query():
async with self.database.acquire() as conn:
return await conn.fetch(query)
try:
return await _execute_query()
except BulkheadFullError:
self.logger.warning("Bulkhead full, rejecting database query")
raise
# Health check implementation
async def check_health(self):
checks = []
# Check circuit breaker state
checks.append({
'component': 'external_api',
'status': (
HealthState.HEALTHY if self.circuit_breaker.closed
else HealthState.DEGRADED
),
'details': {
'state': self.circuit_breaker.current_state,
'failure_count': self.circuit_breaker.failure_count,
'last_failure': self.circuit_breaker.last_failure_time
}
})
# Check bulkhead utilization
checks.append({
'component': 'database_pool',
'status': (
HealthState.HEALTHY if self.bulkhead.available > 0
else HealthState.DEGRADED
),
'details': {
'available': self.bulkhead.available,
'queue_length': self.bulkhead.queue_length,
'active': self.bulkhead.active_count
}
})
# Aggregate health
if all(c['status'] == HealthState.HEALTHY for c in checks):
overall = HealthState.HEALTHY
elif any(c['status'] == HealthState.UNHEALTHY for c in checks):
overall = HealthState.UNHEALTHY
else:
overall = HealthState.DEGRADED
return {
'state': overall,
'checks': checks,
'timestamp': datetime.utcnow()
}
# Saga pattern for distributed transactions
async def process_order_with_saga(self, order):
saga = Saga("process_order")
# Define saga steps
@saga.step("reserve_inventory")
async def reserve():
result = await self.inventory.reserve_items(order['items'])
return result
@reserve.compensate
async def release_inventory(context):
await self.inventory.release_items(context.result)
@saga.step("charge_payment")
async def charge():
result = await self.payment.charge(order['payment'])
return result
@charge.compensate
async def refund_payment(context):
await self.payment.refund(context.result)
@saga.step("ship_order")
async def ship():
result = await self.shipping.create_shipment(order)
return result
@ship.compensate
async def cancel_shipment(context):
await self.shipping.cancel_shipment(context.result)
# Execute saga
try:
result = await saga.execute()
return result
except SagaError as e:
self.logger.error(f"Order processing failed: {e}")
# Compensation already executed
raise
# Advanced retry with circuit breaker integration
async def resilient_operation(self, operation, fallback=None):
policy = self.retry_policy.wrap(self.circuit_breaker)
try:
return await policy.execute(operation)
except Exception as e:
if fallback:
self.logger.warning(f"Operation failed, using fallback: {e}")
return await fallback()
raise
# Watchdog timer for hung operations
async def with_watchdog(self, operation, timeout=30):
try:
return await asyncio.wait_for(operation(), timeout=timeout)
except asyncio.TimeoutError:
self.logger.error(f"Operation timed out after {timeout}s")
# Trigger recovery action
await self.recover_from_timeout()
raise
#include <nexus/sdk.h>
#include <nexus/recovery.h>
#include <chrono>
#include <memory>
class ResilientModule : public nexus::ModuleBase {
private:
std::unique_ptr<nexus::CircuitBreaker> circuit_breaker_;
std::unique_ptr<nexus::RetryPolicy> retry_policy_;
std::unique_ptr<nexus::Bulkhead> bulkhead_;
public:
ResilientModule() : ModuleBase("resilient-module") {
// Configure circuit breaker
circuit_breaker_ = nexus::CircuitBreaker::create("external-api")
->with_failure_threshold(5)
->with_sampling_duration(std::chrono::minutes(1))
->with_minimum_throughput(10)
->with_break_duration(std::chrono::seconds(30));
// Configure retry policy
retry_policy_ = nexus::RetryPolicy::create()
->with_max_attempts(3)
->with_delay(std::chrono::seconds(1))
->with_backoff_multiplier(2.0)
->with_max_delay(std::chrono::seconds(30))
->with_retry_on<TransientException>()
->with_retry_on<TimeoutException>();
// Configure bulkhead
bulkhead_ = nexus::Bulkhead::create("database-connections")
->with_max_concurrency(10)
->with_max_queue_length(50);
}
// Example: Resilient external API call
future<ApiResponse> call_external_api_async(const ApiRequest& request) {
return circuit_breaker_->execute_async(
[this, request]() {
return retry_policy_->execute_async(
[this, request]() {
return make_api_call(request);
}
);
},
[this]() {
// Fallback when circuit is open
logger()->warn("Circuit breaker open, using cached response");
return get_cached_response();
}
);
}
// Example: Bulkhead-protected database operation
future<QueryResult> query_database_async(const std::string& query) {
return bulkhead_->execute_async(
[this, query]() {
return database_->execute_query_async(query);
}
);
}
// Health check implementation
HealthStatus check_health() override {
std::vector<HealthCheckResult> checks;
// Check circuit breaker state
checks.push_back({
.component = "ExternalAPI",
.status = circuit_breaker_->is_closed()
? HealthState::Healthy
: HealthState::Degraded,
.details = {
{"state", circuit_breaker_->state_string()},
{"failure_count", circuit_breaker_->metrics().failure_count},
{"success_count", circuit_breaker_->metrics().success_count}
}
});
// Check bulkhead utilization
checks.push_back({
.component = "DatabasePool",
.status = bulkhead_->available_count() > 0
? HealthState::Healthy
: HealthState::Degraded,
.details = {
{"available", bulkhead_->available_count()},
{"queue_length", bulkhead_->queue_length()}
}
});
// Aggregate health
auto overall = std::all_of(checks.begin(), checks.end(),
[](const auto& c) { return c.status == HealthState::Healthy; })
? HealthState::Healthy
: std::any_of(checks.begin(), checks.end(),
[](const auto& c) { return c.status == HealthState::Unhealthy; })
? HealthState::Unhealthy
: HealthState::Degraded;
return {
.state = overall,
.checks = checks,
.message = generate_health_message(checks)
};
}
// Saga pattern for distributed transactions
future<OrderResult> process_order_with_saga_async(const Order& order) {
auto saga = nexus::Saga<OrderResult>::create("process-order")
->add_step("reserve-inventory",
[this, &order]() {
return inventory_->reserve_items_async(order.items);
},
[this](const auto& result) {
return inventory_->release_items_async(result);
})
->add_step("charge-payment",
[this, &order]() {
return payment_->charge_async(order.payment);
},
[this](const auto& result) {
return payment_->refund_async(result);
})
->add_step("ship-order",
[this, &order]() {
return shipping_->create_shipment_async(order);
},
[this](const auto& result) {
return shipping_->cancel_shipment_async(result);
})
->with_isolation(IsolationLevel::Serializable)
->with_timeout(std::chrono::minutes(5));
return saga->execute_async()
.then([this](auto result) {
return result;
})
.on_error([this](SagaException e) {
logger()->error("Order processing failed at step {}: {}",
e.failed_step(), e.what());
// Compensation already executed
throw e;
});
}
};
classdef ResilientModule < nexus.Module
properties (Access = private)
circuitBreaker
retryPolicy
bulkhead
end
methods
function obj = ResilientModule()
obj@nexus.Module('resilient-module');
% Configure circuit breaker
obj.circuitBreaker = nexus.CircuitBreaker('external-api');
obj.circuitBreaker.failureThreshold = 5;
obj.circuitBreaker.samplingDuration = minutes(1);
obj.circuitBreaker.minimumThroughput = 10;
obj.circuitBreaker.breakDuration = seconds(30);
% Configure retry policy
obj.retryPolicy = nexus.RetryPolicy();
obj.retryPolicy.maxAttempts = 3;
obj.retryPolicy.delay = seconds(1);
obj.retryPolicy.backoffMultiplier = 2.0;
obj.retryPolicy.maxDelay = seconds(30);
obj.retryPolicy.retryOn = {'TransientException', 'TimeoutException'};
% Configure bulkhead
obj.bulkhead = nexus.Bulkhead('database-connections');
obj.bulkhead.maxConcurrency = 10;
obj.bulkhead.maxQueueLength = 50;
end
function response = callExternalApi(obj, request)
% Resilient external API call
response = obj.circuitBreaker.execute( ...
@() obj.retryPolicy.execute( ...
@() obj.makeApiCall(request) ...
), ...
@() obj.getCachedResponse() ... % Fallback
);
end
function result = queryDatabase(obj, query)
% Bulkhead-protected database operation
result = obj.bulkhead.execute( ...
@() obj.database.executeQuery(query) ...
);
end
function health = checkHealth(obj)
% Health check implementation
checks = [];
% Check circuit breaker state
cbCheck.component = 'ExternalAPI';
if obj.circuitBreaker.isClosed()
cbCheck.status = nexus.HealthState.Healthy;
else
cbCheck.status = nexus.HealthState.Degraded;
end
cbCheck.details = struct( ...
'state', obj.circuitBreaker.state, ...
'failureCount', obj.circuitBreaker.metrics.failureCount, ...
'successCount', obj.circuitBreaker.metrics.successCount ...
);
checks = [checks, cbCheck];
% Check bulkhead utilization
bhCheck.component = 'DatabasePool';
if obj.bulkhead.availableCount > 0
bhCheck.status = nexus.HealthState.Healthy;
else
bhCheck.status = nexus.HealthState.Degraded;
end
bhCheck.details = struct( ...
'available', obj.bulkhead.availableCount, ...
'queueLength', obj.bulkhead.queueLength ...
);
checks = [checks, bhCheck];
% Aggregate health
healthyCount = sum([checks.status] == nexus.HealthState.Healthy);
unhealthyCount = sum([checks.status] == nexus.HealthState.Unhealthy);
if healthyCount == length(checks)
overallHealth = nexus.HealthState.Healthy;
elseif unhealthyCount > 0
overallHealth = nexus.HealthState.Unhealthy;
else
overallHealth = nexus.HealthState.Degraded;
end
health = struct( ...
'state', overallHealth, ...
'checks', checks, ...
'message', obj.generateHealthMessage(checks) ...
);
end
function result = processOrderWithSaga(obj, order)
% Saga pattern for distributed transactions
saga = nexus.Saga('process-order');
% Add saga steps
saga.addStep('reserve-inventory', ...
@() obj.inventory.reserveItems(order.items), ...
@(result) obj.inventory.releaseItems(result));
saga.addStep('charge-payment', ...
@() obj.payment.charge(order.payment), ...
@(result) obj.payment.refund(result));
saga.addStep('ship-order', ...
@() obj.shipping.createShipment(order), ...
@(result) obj.shipping.cancelShipment(result));
saga.isolationLevel = nexus.IsolationLevel.Serializable;
saga.timeout = minutes(5);
try
result = saga.execute();
catch ex
if isa(ex, 'nexus.SagaException')
obj.logger.error('Order processing failed at step %s: %s', ...
ex.failedStep, ex.message);
% Compensation already executed
end
rethrow(ex);
end
end
end
methods (Access = private)
function response = makeApiCall(obj, request)
% Implement actual API call
response = webwrite('https://api.example.com/data', request);
end
function response = getCachedResponse(obj)
% Return cached response as fallback
response = obj.cache.get('api-response-default');
end
function message = generateHealthMessage(obj, checks)
% Generate health status message
unhealthy = {checks([checks.status] == nexus.HealthState.Unhealthy).component};
degraded = {checks([checks.status] == nexus.HealthState.Degraded).component};
if ~isempty(unhealthy)
message = sprintf('Unhealthy components: %s', strjoin(unhealthy, ', '));
elseif ~isempty(degraded)
message = sprintf('Degraded components: %s', strjoin(degraded, ', '));
else
message = 'All components healthy';
end
end
end
end
// LabVIEW Recovery System Implementation
// Using NEXUS-1 SDK Recovery VIs
// === ResilientModule.lvclass ===
// Private Data:
// - Circuit Breaker (CircuitBreaker.lvclass)
// - Retry Policy (RetryPolicy.lvclass)
// - Bulkhead (Bulkhead.lvclass)
// === Initialize.vi ===
// 1. Create Circuit Breaker:
// - Name: "external-api"
// - Failure Threshold: 5
// - Sampling Duration: 60000 ms
// - Minimum Throughput: 10
// - Break Duration: 30000 ms
//
// 2. Create Retry Policy:
// - Max Attempts: 3
// - Initial Delay: 1000 ms
// - Backoff Multiplier: 2.0
// - Max Delay: 30000 ms
// - Retry On: ["TransientException", "TimeoutException"]
//
// 3. Create Bulkhead:
// - Name: "database-connections"
// - Max Concurrency: 10
// - Max Queue Length: 50
// === CallExternalAPI.vi ===
// Inputs:
// - Request (Cluster)
// Outputs:
// - Response (Cluster)
// - Error Out
//
// Implementation:
// 1. Circuit Breaker Execute VI:
// - Operation: Retry Policy Execute VI
// - Operation: Make API Call SubVI
// - Fallback: Get Cached Response SubVI
//
// 2. Error Handling:
// - If circuit open: Use fallback
// - If retry exhausted: Return error
// - Log all failures
// === QueryDatabase.vi ===
// Inputs:
// - Query (String)
// Outputs:
// - Result (Variant)
// - Error Out
//
// Implementation:
// 1. Bulkhead Execute VI:
// - Operation: Database Query SubVI
// - Queue if at capacity
// - Reject if queue full
//
// 2. Timeout handling:
// - Set query timeout
// - Cancel on timeout
// - Return timeout error
// === CheckHealth.vi ===
// Outputs:
// - Health Status (Cluster):
// - State (Enum: Healthy/Degraded/Unhealthy)
// - Checks (Array of Check Results)
// - Message (String)
//
// Implementation:
// 1. Check Circuit Breaker:
// - Get State VI
// - Get Metrics VI
// - Build Check Result
//
// 2. Check Bulkhead:
// - Get Available Count VI
// - Get Queue Length VI
// - Build Check Result
//
// 3. Aggregate Health:
// - For Loop over checks
// - Determine overall state
// - Generate message
// === ProcessOrderWithSaga.vi ===
// Inputs:
// - Order (Cluster)
// Outputs:
// - Result (Cluster)
// - Error Out
//
// Saga Implementation:
// 1. Create Saga:
// - Name: "process-order"
// - Isolation: Serializable
// - Timeout: 300000 ms (5 min)
//
// 2. Add Steps (using Saga Builder):
// Step 1: "reserve-inventory"
// - Action: Reserve Items SubVI
// - Compensation: Release Items SubVI
//
// Step 2: "charge-payment"
// - Action: Charge Payment SubVI
// - Compensation: Refund Payment SubVI
//
// Step 3: "ship-order"
// - Action: Create Shipment SubVI
// - Compensation: Cancel Shipment SubVI
//
// 3. Execute Saga:
// - Run all steps in sequence
// - On failure: Run compensations in reverse
// - Log each step result
// === Recovery Utilities ===
// 1. Exponential Backoff Calculator:
// delay = min(initial * (multiplier ^ attempt), maxDelay)
// jitter = delay * Random(1 - jitter, 1 + jitter)
//
// 2. Circuit State Machine:
// CLOSED -> OPEN (on threshold)
// OPEN -> HALF_OPEN (on timeout)
// HALF_OPEN -> CLOSED (on success)
// HALF_OPEN -> OPEN (on failure)
//
// 3. Bulkhead Semaphore:
// - Acquire before operation
// - Release after completion
// - Queue when at capacity
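The circuit state machine above maps directly to code. A minimal C# sketch of the transitions (illustrative types, not part of the SDK):
enum CircuitState { Closed, Open, HalfOpen }

static CircuitState Next(CircuitState state, bool success, bool thresholdReached, bool breakElapsed) =>
    (state, success, thresholdReached, breakElapsed) switch
    {
        (CircuitState.Closed, _, true, _)    => CircuitState.Open,     // failure threshold reached
        (CircuitState.Open, _, _, true)      => CircuitState.HalfOpen, // break duration elapsed
        (CircuitState.HalfOpen, true, _, _)  => CircuitState.Closed,   // trial call succeeded
        (CircuitState.HalfOpen, false, _, _) => CircuitState.Open,     // trial call failed
        _ => state
    };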
Recovery Best Practices
Design for Failure
- Fail Fast: Don't wait for timeouts when failure is certain
- Graceful Degradation: Provide reduced functionality rather than complete failure
- Idempotency: Ensure operations can be safely retried
- Timeout Everything: Never wait indefinitely for external resources
- Monitor Recovery: Track recovery metrics and patterns
- Test Failure Scenarios: Regularly test recovery mechanisms
- Document Recovery Behavior: Make recovery strategies visible to operators
- Avoid Retry Storms: Use exponential backoff and jitter (see the sketch after this list)
- Resource Isolation: Use bulkheads to prevent resource exhaustion
- Clear Error Messages: Provide actionable information when recovery fails
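The backoff-and-jitter rule from the recovery utilities above, written out as a small C# helper (a sketch, not an SDK type):
using System;

public static class Backoff
{
    private static readonly Random Rng = new Random();

    // delay = min(initial * multiplier^attempt, maxDelay), then scaled by a
    // random factor in [1 - jitter, 1 + jitter] so concurrent clients desynchronize.
    public static TimeSpan NextDelay(int attempt, TimeSpan initial,
        double multiplier, TimeSpan maxDelay, double jitter = 0.2)
    {
        var raw = initial.TotalMilliseconds * Math.Pow(multiplier, attempt);
        var capped = Math.Min(raw, maxDelay.TotalMilliseconds);
        var factor = 1 + ((Rng.NextDouble() * 2) - 1) * jitter;
        return TimeSpan.FromMilliseconds(capped * factor);
    }
}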
Security
Implement robust security measures in your NEXUS-1 modules to protect industrial systems from threats, ensure data integrity, and maintain compliance with security standards.
Authentication
Verify the identity of users, services, and modules before granting access to system resources.
Token-Based Authentication
using System;
using System.Collections.Generic;
using System.IdentityModel.Tokens.Jwt;
using System.Security.Claims;
using System.Text;
using System.Threading.Tasks;
using Microsoft.IdentityModel.Tokens;
public class AuthenticationModule : ModuleBase
{
private IConfiguration _config;
private ILogger _logger;
private string _jwtSecret;
private IUserService _userService;       // credential store abstraction (illustrative type)
private IPasswordHasher _passwordHasher; // e.g. a bcrypt-based hasher (illustrative type)
public override async Task InitializeAsync(IModuleContext context)
{
_config = context.Configuration;
_logger = context.Logger;
_jwtSecret = _config.GetValue<string>("Security:JwtSecret");
// Subscribe to authentication requests
await MessageBus.SubscribeAsync("auth/request/*", HandleAuthRequest);
_logger.Information("Authentication module initialized");
}
private async Task HandleAuthRequest(AuthRequest request)
{
try
{
// Validate credentials
var user = await ValidateCredentials(request.Username, request.Password);
if (user == null)
{
throw new UnauthorizedAccessException("Invalid credentials");
}
// Generate JWT token
var token = GenerateJwtToken(user);
// Publish response
await MessageBus.PublishAsync(new AuthResponse
{
Success = true,
Token = token,
ExpiresIn = 3600,
UserId = user.Id,
Roles = user.Roles
});
// Audit successful login
await Audit.SecurityEventAsync("LOGIN_SUCCESS", $"User {user.Username} logged in");
}
catch (Exception ex)
{
_logger.Error(ex, "Authentication failed");
await Audit.SecurityEventAsync("LOGIN_FAILURE", $"Failed login attempt for {request.Username}");
throw;
}
}
private string GenerateJwtToken(User user)
{
var tokenHandler = new JwtSecurityTokenHandler();
var key = Encoding.ASCII.GetBytes(_jwtSecret);
var claims = new List<Claim>
{
new Claim(ClaimTypes.Name, user.Username),
new Claim(ClaimTypes.NameIdentifier, user.Id),
new Claim("module_access", "true")
};
// Add role claims
foreach (var role in user.Roles)
{
claims.Add(new Claim(ClaimTypes.Role, role));
}
var tokenDescriptor = new SecurityTokenDescriptor
{
Subject = new ClaimsIdentity(claims),
Expires = DateTime.UtcNow.AddHours(1),
SigningCredentials = new SigningCredentials(
new SymmetricSecurityKey(key),
SecurityAlgorithms.HmacSha256Signature)
};
var token = tokenHandler.CreateToken(tokenDescriptor);
return tokenHandler.WriteToken(token);
}
private async Task<User> ValidateCredentials(string username, string password)
{
// Implement your credential validation logic
// This could check against AD, database, etc.
var user = await _userService.GetUserAsync(username);
if (user != null && _passwordHasher.VerifyPassword(password, user.PasswordHash))
{
return user;
}
return null;
}
}
import jwt
import datetime
import hashlib
import secrets
from typing import Dict, List, Optional
class AuthenticationModule(ModuleBase):
def __init__(self):
super().__init__()
self.jwt_secret = None
self.token_expiry = 3600 # 1 hour
async def initialize_async(self, context: ModuleContext):
"""Initialize the authentication module"""
self.jwt_secret = context.configuration.get_value("Security:JwtSecret")
if not self.jwt_secret:
raise ValueError("JWT secret not configured")
# Subscribe to authentication requests
await self.message_bus.subscribe_async(
"auth/request/*",
self._handle_auth_request
)
self.logger.info("Authentication module initialized")
async def _handle_auth_request(self, request: Dict, context: MessageContext):
"""Handle authentication requests"""
try:
username = request.get('username')
password = request.get('password')
# Validate credentials
user = await self._validate_credentials(username, password)
if not user:
raise UnauthorizedError("Invalid credentials")
# Generate JWT token
token = self._generate_jwt_token(user)
# Publish response
await self.message_bus.publish_async({
'success': True,
'token': token,
'expires_in': self.token_expiry,
'user_id': user['id'],
'roles': user.get('roles', [])
}, topic=f"auth/response/{context.correlation_id}")
# Audit successful login
await self.audit.security_event(
"LOGIN_SUCCESS",
f"User {username} logged in",
user_id=user['id']
)
except Exception as e:
self.logger.error(f"Authentication failed: {str(e)}")
await self.audit.security_event(
"LOGIN_FAILURE",
f"Failed login attempt for {username}"
)
# Send failure response
await self.message_bus.publish_async({
'success': False,
'error': str(e)
}, topic=f"auth/response/{context.correlation_id}")
def _generate_jwt_token(self, user: Dict) -> str:
"""Generate a JWT token for the user"""
payload = {
'user_id': user['id'],
'username': user['username'],
'roles': user.get('roles', []),
'module_access': True,
'exp': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.token_expiry),
'iat': datetime.datetime.utcnow(),
'jti': secrets.token_hex(16) # JWT ID for revocation
}
return jwt.encode(
payload,
self.jwt_secret,
algorithm='HS256'
)
async def _validate_credentials(self, username: str, password: str) -> Optional[Dict]:
"""Validate user credentials"""
# Implement your credential validation logic
# This could check against AD, database, etc.
user = await self.user_service.get_user(username)
if user and self._verify_password(password, user['password_hash']):
return user
return None
def _verify_password(self, password: str, password_hash: str) -> bool:
"""Verify password against hash"""
# Using a simple example - use bcrypt or similar in production
return hashlib.sha256(password.encode()).hexdigest() == password_hash
def verify_token(self, token: str) -> Optional[Dict]:
"""Verify and decode a JWT token"""
try:
payload = jwt.decode(
token,
self.jwt_secret,
algorithms=['HS256']
)
return payload
except jwt.ExpiredSignatureError:
self.logger.warning("Token expired")
return None
except jwt.InvalidTokenError as e:
self.logger.warning(f"Invalid token: {str(e)}")
return None
#include <jwt-cpp/jwt.h>
#include <openssl/sha.h>
#include <fmt/format.h>
#include <random>
#include <sstream>
#include <iomanip>
class AuthenticationModule : public ModuleBase {
private:
std::string jwt_secret_;
std::chrono::seconds token_expiry_{3600}; // 1 hour
public:
async_task<void> initialize_async(std::shared_ptr<ModuleContext> context) override {
jwt_secret_ = co_await context->configuration()->get_value("Security:JwtSecret");
if (jwt_secret_.empty()) {
throw std::runtime_error("JWT secret not configured");
}
// Subscribe to authentication requests
co_await message_bus_->subscribe_async(
"auth/request/*",
[this](auto msg, auto ctx) { return handle_auth_request(msg, ctx); }
);
logger_->info("Authentication module initialized");
}
private:
async_task<void> handle_auth_request(
const AuthRequest& request,
const MessageContext& context) {
try {
// Validate credentials
auto user = co_await validate_credentials(request.username, request.password);
if (!user) {
throw unauthorized_error("Invalid credentials");
}
// Generate JWT token
auto token = generate_jwt_token(*user);
// Publish response
AuthResponse response{
.success = true,
.token = token,
.expires_in = token_expiry_.count(),
.user_id = user->id,
.roles = user->roles
};
co_await message_bus_->publish_async(
response,
MessageOptions{.topic = fmt::format("auth/response/{}", context.correlation_id)}
);
// Audit successful login
co_await audit_->security_event_async(
"LOGIN_SUCCESS",
fmt::format("User {} logged in", user->username)
);
} catch (const std::exception& e) {
logger_->error("Authentication failed: {}", e.what());
co_await audit_->security_event_async(
"LOGIN_FAILURE",
fmt::format("Failed login attempt for {}", request.username)
);
// Send failure response
AuthResponse response{
.success = false,
.error = e.what()
};
co_await message_bus_->publish_async(
response,
MessageOptions{.topic = fmt::format("auth/response/{}", context.correlation_id)}
);
}
}
std::string generate_jwt_token(const User& user) {
auto now = std::chrono::system_clock::now();
auto exp_time = now + token_expiry_;
// Create JWT token
auto token = jwt::create()
.set_issuer("nexus-auth")
.set_type("JWT")
.set_payload_claim("user_id", jwt::claim(user.id))
.set_payload_claim("username", jwt::claim(user.username))
.set_payload_claim("module_access", jwt::claim(std::string("true")))
.set_payload_claim("roles", jwt::claim(user.roles))
.set_issued_at(now)
.set_expires_at(exp_time)
.set_id(generate_jti())
.sign(jwt::algorithm::hs256{jwt_secret_});
return token;
}
async_task<std::optional<User>> validate_credentials(
const std::string& username,
const std::string& password) {
// Implement your credential validation logic
// This could check against AD, database, etc.
auto user = co_await user_service_->get_user_async(username);
if (user && verify_password(password, user->password_hash)) {
co_return user;
}
co_return std::nullopt;
}
bool verify_password(const std::string& password, const std::string& hash) {
// Use proper password hashing library like bcrypt in production
unsigned char result[SHA256_DIGEST_LENGTH];
SHA256(reinterpret_cast<const unsigned char*>(password.c_str()),
password.length(), result);
std::stringstream ss;
for(int i = 0; i < SHA256_DIGEST_LENGTH; i++) {
ss << std::hex << std::setw(2) << std::setfill('0') << (int)result[i];
}
return ss.str() == hash;
}
std::string generate_jti() {
// Generate unique JWT ID
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(0, 15);
const char* hex_chars = "0123456789abcdef";
std::string jti;
jti.reserve(32);
for (int i = 0; i < 32; ++i) {
jti += hex_chars[dis(gen)];
}
return jti;
}
public:
std::optional<jwt::decoded_jwt<jwt::traits::kazuho_picojson>> verify_token(
const std::string& token) {
try {
auto decoded = jwt::decode(token);
auto verifier = jwt::verify()
.allow_algorithm(jwt::algorithm::hs256{jwt_secret_})
.with_issuer("nexus-auth")
.with_type("JWT");
verifier.verify(decoded);
return decoded;
} catch (const std::exception& e) {
logger_->warning("Invalid token: {}", e.what());
return std::nullopt;
}
}
};
classdef AuthenticationModule < nexus.Module
properties (Access = private)
jwtSecret
tokenExpiry = 3600 % 1 hour in seconds
userService
end
methods
function obj = AuthenticationModule()
obj@nexus.Module('Authentication Module', '1.0.0');
end
function onInitialize(obj)
% Get JWT secret from configuration
obj.jwtSecret = obj.config().getValue('Security.JwtSecret');
if isempty(obj.jwtSecret)
error('JWT secret not configured');
end
% Initialize user service
obj.userService = UserService(obj.config());
% Subscribe to authentication requests
obj.subscribe('auth.request.*', @obj.handleAuthRequest);
obj.Logger.info('Authentication module initialized');
end
function onStart(obj)
% Module started
obj.Logger.info('Authentication module started');
end
function onStop(obj)
% Clean up resources
obj.Logger.info('Authentication module stopped');
end
function health = onCheckHealth(obj)
health = nexus.HealthStatus();
health.State = nexus.HealthState.Healthy;
health.Message = 'Authentication service is healthy';
end
function handleAuthRequest(obj, message, context)
try
request = message.Payload;
username = request.username;
password = request.password;
% Validate credentials
user = obj.validateCredentials(username, password);
if isempty(user)
throw(MException('Auth:InvalidCredentials', 'Invalid credentials'));
end
% Generate JWT token
token = obj.generateJwtToken(user);
% Create response
response = struct(...
'success', true, ...
'token', token, ...
'expiresIn', obj.tokenExpiry, ...
'userId', user.id, ...
'roles', {user.roles} ...
);
% Publish response
obj.publish(response, nexus.MessageOptions(...
'Topic', sprintf('auth.response.%s', context.CorrelationId)));
% Audit successful login
obj.audit().securityEvent('LOGIN_SUCCESS', ...
sprintf('User %s logged in', username)) ...
.byUser(user.id) ...
.record();
catch ME
obj.Logger.error('Authentication failed', struct('error', ME.message));
% Audit failed login
obj.audit().securityEvent('LOGIN_FAILURE', ...
sprintf('Failed login attempt for %s', username)) ...
.withProperty('reason', ME.message) ...
.record();
% Send failure response
response = struct(...
'success', false, ...
'error', ME.message ...
);
obj.publish(response, nexus.MessageOptions(...
'Topic', sprintf('auth.response.%s', context.CorrelationId)));
end
end
function token = generateJwtToken(obj, user)
% Create JWT header (JWT requires base64url, not standard base64)
header = struct('alg', 'HS256', 'typ', 'JWT');
headerJson = jsonencode(header);
headerBase64 = obj.base64urlEncode(uint8(headerJson));
% Create JWT payload
now = posixtime(datetime('now', 'TimeZone', 'UTC'));
payload = struct(...
'user_id', user.id, ...
'username', user.username, ...
'roles', {user.roles}, ...
'module_access', true, ...
'iat', now, ...
'exp', now + obj.tokenExpiry, ...
'jti', char(java.util.UUID.randomUUID()) ...
);
payloadJson = jsonencode(payload);
payloadBase64 = obj.base64urlEncode(uint8(payloadJson));
% Create signature over "header.payload"
message = sprintf('%s.%s', headerBase64, payloadBase64);
signature = obj.hmacSha256(message, obj.jwtSecret);
signatureBase64 = obj.base64urlEncode(signature);
% Combine to create token
token = sprintf('%s.%s.%s', headerBase64, payloadBase64, signatureBase64);
end
function user = validateCredentials(obj, username, password)
% Implement credential validation
user = obj.userService.getUser(username);
if ~isempty(user) && obj.verifyPassword(password, user.passwordHash)
return;
end
user = [];
end
function valid = verifyPassword(~, password, passwordHash)
% In production, use proper password hashing like bcrypt.
% This simplified example computes SHA-256 via Java:
md = java.security.MessageDigest.getInstance('SHA-256');
digest = typecast(md.digest(uint8(password)), 'uint8');
hash = lower(reshape(dec2hex(digest, 2)', 1, []));
valid = strcmp(hash, passwordHash);
end
function signature = hmacSha256(~, message, key)
% Generate HMAC-SHA256 signature via Java crypto
import javax.crypto.*
import javax.crypto.spec.*
keySpec = SecretKeySpec(uint8(key), 'HmacSHA256');
mac = Mac.getInstance('HmacSHA256');
mac.init(keySpec);
signature = typecast(mac.doFinal(uint8(message)), 'uint8');
end
function encoded = base64urlEncode(~, bytes)
% JWT uses base64url (RFC 7515): '+' -> '-', '/' -> '_', no padding
encoded = matlab.net.base64encode(bytes);
encoded = strrep(strrep(strrep(encoded, '+', '-'), '/', '_'), '=', '');
end
function bytes = base64urlDecode(~, encoded)
% Inverse of base64urlEncode: restore standard alphabet and padding
encoded = strrep(strrep(encoded, '-', '+'), '_', '/');
padLen = mod(4 - mod(length(encoded), 4), 4);
bytes = matlab.net.base64decode([encoded, repmat('=', 1, padLen)]);
end
function payload = verifyToken(obj, token)
% Verify and decode JWT token
try
parts = strsplit(token, '.');
if numel(parts) ~= 3
error('Invalid token format');
end
% Verify signature (recompute over "header.payload")
message = sprintf('%s.%s', parts{1}, parts{2});
expectedSignature = obj.base64urlEncode(obj.hmacSha256(message, obj.jwtSecret));
if ~strcmp(expectedSignature, parts{3})
error('Invalid token signature');
end
% Decode payload
payloadJson = obj.base64urlDecode(parts{2});
payload = jsondecode(char(payloadJson));
% Check expiration
now = posixtime(datetime('now', 'TimeZone', 'UTC'));
if now > payload.exp
error('Token expired');
end
catch ME
obj.Logger.warning('Invalid token', struct('error', ME.message));
payload = [];
end
end
end
end
// Authentication Module Implementation in LabVIEW
// AuthenticationModule.lvclass - Main module class
// Private Data Cluster:
// - JWTSecret (String) - Secret key for JWT signing
// - TokenExpiry (I32) - Token expiry time in seconds (default: 3600)
// - UserService (UserService.lvclass) - User validation service
// - Logger (Logger.lvclass) - Module logger
// Initialize.vi Override:
Inputs:
- Module Context (ModuleContext.lvclass)
- Error In
Process:
1. Call Parent Initialize
2. Get JWT Secret from Config
- Key: "Security.JwtSecret"
- Error if not found
3. Initialize User Service
4. Subscribe to Authentication
- Topic: "auth.request.*"
- Handler: Handle Auth Request.vi
5. Log "Authentication module initialized"
Outputs:
- Error Out
// Handle Auth Request.vi
Inputs:
- Message (Variant) - Auth request message
- Context (MessageContext) - Message context
- Error In
Process:
1. Convert Message to Auth Request
- Username (String)
- Password (String)
2. Validate Credentials
- Call Validate Credentials.vi
- If invalid, create error
3. If Valid:
- Generate JWT Token.vi
- Create Success Response
* success: true
* token: JWT string
* expiresIn: 3600
* userId: User ID
* roles: User roles array
- Publish to "auth.response.[correlationId]"
- Audit successful login
4. If Invalid:
- Create Failure Response
* success: false
* error: "Invalid credentials"
- Publish to "auth.response.[correlationId]"
- Audit failed login
// Generate JWT Token.vi
Inputs:
- User (User.ctl) - User information
- JWT Secret (String)
- Token Expiry (I32)
- Error In
Process:
1. Create JWT Header
- alg: "HS256"
- typ: "JWT"
- Convert to JSON
- Base64 encode
2. Create JWT Payload
- user_id: User ID
- username: Username
- roles: User roles
- module_access: true
- iat: Current Unix timestamp
- exp: Current + expiry
- jti: Random UUID
- Convert to JSON
- Base64 encode
3. Create Signature
- Message = header + "." + payload
- HMAC-SHA256 with secret
- Base64 encode
4. Combine Token
- token = header + "." + payload + "." + signature
Outputs:
- JWT Token (String)
- Error Out
// Validate Credentials.vi
Inputs:
- Username (String)
- Password (String)
- User Service (UserService.lvclass)
- Error In
Process:
1. Get User from Service
- Query by username
2. If User Found:
- Verify Password Hash
- Use bcrypt or similar
3. Return User or Empty
Outputs:
- User (User.ctl) - Empty if invalid
- Valid (Boolean)
- Error Out
// Verify Token.vi
Inputs:
- Token (String)
- JWT Secret (String)
- Error In
Process:
1. Split Token by "."
- Must have 3 parts
2. Verify Signature
- Recreate signature from header.payload
- Compare with token signature
3. Decode Payload
- Base64 decode
- JSON parse
4. Check Expiration
- Compare exp with current time
5. Return Payload or Error
Outputs:
- Payload (Variant) - Token claims
- Valid (Boolean)
- Error Out
// HMAC-SHA256.vi - Utility VI
Inputs:
- Message (String)
- Key (String)
- Error In
Process:
1. Use .NET System.Security.Cryptography
2. Create HMACSHA256 object
3. Set key
4. Compute hash of message
5. Return byte array
Outputs:
- Signature (U8 Array)
- Error Out
// Example Usage in Another Module:
// Request Authentication
1. Create Auth Request
- username: "operator1"
- password: "secure_password"
2. Generate Correlation ID
3. Subscribe to Response
- Topic: "auth.response.[correlationId]"
- Timeout: 5 seconds
4. Publish Request
- Topic: "auth.request.login"
- Payload: Auth request
5. Wait for Response
- Check success flag
- Store token if successful
- Handle error if failed
// Using Token for Requests
1. Add to Message Headers
- Authorization: "Bearer [token]"
2. Verify Token in Receiving Module
- Extract from headers
- Call Verify Token.vi
- Check roles/permissions
// Token Refresh Pattern
1. Monitor Token Expiry
- Check exp claim
- Refresh before expiry
2. Request New Token
- Use refresh token if available
- Or re-authenticate
// Security Best Practices:
// - Store JWT secret securely (use Key Vault)
// - Use HTTPS for all communications
// - Implement token revocation list
// - Log all authentication attempts
// - Use strong password hashing (bcrypt/scrypt)
// - Implement rate limiting
// - Monitor for suspicious patterns
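The token refresh pattern above, sketched in C# for concreteness (the _currentToken field and AuthenticateAsync helper are illustrative, not SDK members):
public async Task<string> GetValidTokenAsync()
{
    // Inspect the exp claim without validating the signature
    var jwt = new JwtSecurityTokenHandler().ReadJwtToken(_currentToken);

    // Refresh while a safety margin of validity remains
    if (jwt.ValidTo - DateTime.UtcNow < TimeSpan.FromMinutes(5))
    {
        _currentToken = await AuthenticateAsync(); // or redeem a refresh token
    }
    return _currentToken;
}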
Data Encryption
Protect sensitive data in transit and at rest using industry-standard encryption algorithms and best practices.
Message Encryption
using System;
using System.Collections.Generic;
using System.IO;
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
public class EncryptionModule : ModuleBase
{
private byte[] _encryptionKey;
public override async Task InitializeAsync(IModuleContext context)
{
await base.InitializeAsync(context);
// Load encryption key from secure configuration
var keyBase64 = Configuration.GetValue<string>("Security:EncryptionKey");
_encryptionKey = Convert.FromBase64String(keyBase64);
// A fresh IV is generated per message inside EncryptData
// Subscribe to encryption requests
await MessageBus.SubscribeAsync("crypto/encrypt/*", HandleEncryptRequest);
await MessageBus.SubscribeAsync("crypto/decrypt/*", HandleDecryptRequest);
_logger.Information("Encryption module initialized");
}
public byte[] EncryptData(byte[] plainData)
{
using (var aes = Aes.Create())
{
aes.Key = _encryptionKey;
aes.IV = GenerateIV(); // fresh random IV per message; never reuse an IV with CBC
aes.Mode = CipherMode.CBC;
aes.Padding = PaddingMode.PKCS7;
using (var encryptor = aes.CreateEncryptor())
using (var ms = new MemoryStream())
{
// Write the IV to the beginning so DecryptData can recover it
ms.Write(aes.IV, 0, aes.IV.Length);
using (var cs = new CryptoStream(ms, encryptor, CryptoStreamMode.Write))
{
cs.Write(plainData, 0, plainData.Length);
cs.FlushFinalBlock();
}
return ms.ToArray();
}
}
}
public byte[] DecryptData(byte[] encryptedData)
{
using (var aes = Aes.Create())
{
aes.Key = _encryptionKey;
using (var ms = new MemoryStream(encryptedData))
{
// Read IV from the beginning
byte[] iv = new byte[16];
ms.Read(iv, 0, 16);
aes.IV = iv;
aes.Mode = CipherMode.CBC;
aes.Padding = PaddingMode.PKCS7;
using (var decryptor = aes.CreateDecryptor())
using (var cs = new CryptoStream(ms, decryptor, CryptoStreamMode.Read))
using (var output = new MemoryStream())
{
cs.CopyTo(output);
return output.ToArray();
}
}
}
}
// Encrypt sensitive message payloads
public async Task<Message> EncryptMessageAsync(Message message)
{
var payload = message.GetPayload<object>();
var json = JsonSerializer.Serialize(payload);
var plainBytes = Encoding.UTF8.GetBytes(json);
var encryptedBytes = EncryptData(plainBytes);
return new Message
{
Topic = message.Topic,
Headers = new Dictionary<string, string>(message.Headers)
{
["X-Encrypted"] = "true",
["X-Encryption-Algorithm"] = "AES-256-CBC"
},
Payload = Convert.ToBase64String(encryptedBytes),
CorrelationId = message.CorrelationId
};
}
// Sign messages for integrity verification
public string SignMessage(byte[] data)
{
using (var hmac = new HMACSHA256(_encryptionKey))
{
var signature = hmac.ComputeHash(data);
return Convert.ToBase64String(signature);
}
}
public bool VerifySignature(byte[] data, string signature)
{
using (var hmac = new HMACSHA256(_encryptionKey))
{
var expected = hmac.ComputeHash(data);
var provided = Convert.FromBase64String(signature);
// Constant-time comparison avoids leaking timing information
return CryptographicOperations.FixedTimeEquals(expected, provided);
}
}
private byte[] GenerateIV()
{
using (var rng = RandomNumberGenerator.Create())
{
var iv = new byte[16]; // AES block size
rng.GetBytes(iv);
return iv;
}
}
}
import base64
import json
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import hashes, hmac, padding
from cryptography.hazmat.backends import default_backend
import os
class EncryptionModule(ModuleBase):
def __init__(self):
super().__init__()
self.encryption_key = None
self.backend = default_backend()
async def initialize(self, context: ModuleContext):
"""Initialize encryption module"""
await super().initialize(context)
# Load encryption key from secure configuration
key_base64 = self.config.get("Security.EncryptionKey")
self.encryption_key = base64.b64decode(key_base64)
# Subscribe to encryption/decryption requests
await self.message_bus.subscribe("crypto/encrypt/*", self.handle_encrypt_request)
await self.message_bus.subscribe("crypto/decrypt/*", self.handle_decrypt_request)
self.logger.info("Encryption module initialized")
def encrypt_data(self, plain_data: bytes) -> bytes:
"""Encrypt data using AES-256-CBC"""
# Generate random IV
iv = os.urandom(16)
# Create cipher
cipher = Cipher(
algorithms.AES(self.encryption_key),
modes.CBC(iv),
backend=self.backend
)
encryptor = cipher.encryptor()
# Pad data to block size
padder = padding.PKCS7(128).padder()
padded_data = padder.update(plain_data) + padder.finalize()
# Encrypt
encrypted = encryptor.update(padded_data) + encryptor.finalize()
# Prepend IV to encrypted data
return iv + encrypted
def decrypt_data(self, encrypted_data: bytes) -> bytes:
"""Decrypt data using AES-256-CBC"""
# Extract IV from beginning
iv = encrypted_data[:16]
ciphertext = encrypted_data[16:]
# Create cipher
cipher = Cipher(
algorithms.AES(self.encryption_key),
modes.CBC(iv),
backend=self.backend
)
decryptor = cipher.decryptor()
# Decrypt
padded_plain = decryptor.update(ciphertext) + decryptor.finalize()
# Remove padding
unpadder = padding.PKCS7(128).unpadder()
plain = unpadder.update(padded_plain) + unpadder.finalize()
return plain
async def encrypt_message(self, message: Message) -> Message:
"""Encrypt message payload"""
# Serialize payload
payload_json = json.dumps(message.get_payload())
plain_bytes = payload_json.encode('utf-8')
# Encrypt
encrypted_bytes = self.encrypt_data(plain_bytes)
# Create encrypted message
encrypted_message = Message(
topic=message.topic,
headers={
**message.headers,
"X-Encrypted": "true",
"X-Encryption-Algorithm": "AES-256-CBC"
},
payload=base64.b64encode(encrypted_bytes).decode('utf-8'),
correlation_id=message.correlation_id
)
return encrypted_message
def sign_message(self, data: bytes) -> str:
"""Sign data using HMAC-SHA256"""
h = hmac.HMAC(self.encryption_key, hashes.SHA256(), backend=self.backend)
h.update(data)
signature = h.finalize()
return base64.b64encode(signature).decode('utf-8')
def verify_signature(self, data: bytes, signature: str) -> bool:
"""Verify HMAC-SHA256 signature in constant time"""
h = hmac.HMAC(self.encryption_key, hashes.SHA256(), backend=self.backend)
h.update(data)
try:
h.verify(base64.b64decode(signature))
return True
except Exception:
return False
async def handle_encrypt_request(self, message: Message):
"""Handle encryption requests"""
try:
data = message.get_payload()
if isinstance(data, str):
data = data.encode('utf-8')
elif isinstance(data, dict):
data = json.dumps(data).encode('utf-8')
encrypted = self.encrypt_data(data)
await self.message_bus.publish(
f"crypto/encrypted/{message.correlation_id}",
{
"data": base64.b64encode(encrypted).decode('utf-8'),
"algorithm": "AES-256-CBC"
}
)
except Exception as e:
self.logger.error(f"Encryption failed: {str(e)}")
await self.message_bus.publish(
f"crypto/error/{message.correlation_id}",
{"error": str(e)}
)
#include <nexus/module.hpp>
#include <openssl/evp.h>
#include <openssl/aes.h>
#include <openssl/hmac.h>
#include <openssl/rand.h>
#include <vector>
#include <string>
class EncryptionModule : public nexus::ModuleBase {
private:
std::vector<unsigned char> encryption_key_;
public:
nexus::Status Initialize(const nexus::ModuleContext& context) override {
nexus::Status status = ModuleBase::Initialize(context);
if (!status.ok()) return status;
// Load encryption key
auto key_base64 = config_->GetString("Security.EncryptionKey");
encryption_key_ = Base64Decode(key_base64);
// Subscribe to crypto requests
message_bus_->Subscribe("crypto/encrypt/*",
[this](const nexus::Message& msg) {
return HandleEncryptRequest(msg);
});
logger_->Info("Encryption module initialized");
return nexus::Status::OK;
}
std::vector<unsigned char> EncryptData(const std::vector<unsigned char>& plain_data) {
// Generate random IV
std::vector<unsigned char> iv(AES_BLOCK_SIZE);
RAND_bytes(iv.data(), AES_BLOCK_SIZE);
// Create cipher context
EVP_CIPHER_CTX* ctx = EVP_CIPHER_CTX_new();
EVP_EncryptInit_ex(ctx, EVP_aes_256_cbc(), nullptr,
encryption_key_.data(), iv.data());
// Encrypt data
std::vector<unsigned char> encrypted(plain_data.size() + AES_BLOCK_SIZE);
int len;
int ciphertext_len;
EVP_EncryptUpdate(ctx, encrypted.data(), &len,
plain_data.data(), plain_data.size());
ciphertext_len = len;
EVP_EncryptFinal_ex(ctx, encrypted.data() + len, &len);
ciphertext_len += len;
EVP_CIPHER_CTX_free(ctx);
// Prepend IV to encrypted data
encrypted.resize(ciphertext_len);
encrypted.insert(encrypted.begin(), iv.begin(), iv.end());
return encrypted;
}
std::vector<unsigned char> DecryptData(const std::vector<unsigned char>& encrypted_data) {
// Extract IV
std::vector<unsigned char> iv(encrypted_data.begin(),
encrypted_data.begin() + AES_BLOCK_SIZE);
// Create cipher context
EVP_CIPHER_CTX* ctx = EVP_CIPHER_CTX_new();
EVP_DecryptInit_ex(ctx, EVP_aes_256_cbc(), nullptr,
encryption_key_.data(), iv.data());
// Decrypt data
std::vector<unsigned char> decrypted(encrypted_data.size());
int len;
int plaintext_len;
EVP_DecryptUpdate(ctx, decrypted.data(), &len,
encrypted_data.data() + AES_BLOCK_SIZE,
encrypted_data.size() - AES_BLOCK_SIZE);
plaintext_len = len;
EVP_DecryptFinal_ex(ctx, decrypted.data() + len, &len);
plaintext_len += len;
EVP_CIPHER_CTX_free(ctx);
decrypted.resize(plaintext_len);
return decrypted;
}
std::string SignMessage(const std::vector<unsigned char>& data) {
unsigned char* digest = HMAC(EVP_sha256(),
encryption_key_.data(),
encryption_key_.size(),
data.data(),
data.size(),
nullptr, nullptr);
return Base64Encode(digest, 32);
}
bool VerifySignature(const std::vector<unsigned char>& data,
const std::string& signature) {
auto expected = SignMessage(data);
return expected == signature;
}
};
classdef EncryptionModule < nexus.ModuleBase
properties (Access = private)
encryptionKey
cipher
end
methods
function obj = EncryptionModule()
obj@nexus.ModuleBase();
end
function initialize(obj, context)
% Initialize encryption module
initialize@nexus.ModuleBase(obj, context);
% Load encryption key
keyBase64 = obj.config.getString('Security.EncryptionKey');
obj.encryptionKey = matlab.net.base64decode(keyBase64);
% Create cipher
obj.cipher = matlab.security.Cipher('AES', 'CBC', 'PKCS5');
% Subscribe to crypto requests
obj.messageBus.subscribe('crypto/encrypt/*', @obj.handleEncryptRequest);
obj.messageBus.subscribe('crypto/decrypt/*', @obj.handleDecryptRequest);
obj.logger.info('Encryption module initialized');
end
function encrypted = encryptData(obj, plainData)
% Encrypt data using AES-256-CBC
% Generate random IV
iv = randi([0 255], 1, 16, 'uint8');
% Encrypt
encrypted = obj.cipher.encrypt(plainData, obj.encryptionKey, iv);
% Prepend IV
encrypted = [iv, encrypted];
end
function plainData = decryptData(obj, encryptedData)
% Decrypt data using AES-256-CBC
% Extract IV
iv = encryptedData(1:16);
ciphertext = encryptedData(17:end);
% Decrypt
plainData = obj.cipher.decrypt(ciphertext, obj.encryptionKey, iv);
end
function encryptedMsg = encryptMessage(obj, message)
% Encrypt message payload
payload = message.getPayload();
% Convert to JSON string
jsonStr = jsonencode(payload);
plainBytes = uint8(jsonStr);
% Encrypt
encryptedBytes = obj.encryptData(plainBytes);
% Create encrypted message
encryptedMsg = nexus.Message();
encryptedMsg.topic = message.topic;
encryptedMsg.headers = message.headers;
encryptedMsg.headers('X-Encrypted') = 'true';
encryptedMsg.headers('X-Encryption-Algorithm') = 'AES-256-CBC';
encryptedMsg.payload = matlab.net.base64encode(encryptedBytes);
encryptedMsg.correlationId = message.correlationId;
end
function signature = signMessage(obj, data)
% Sign data using HMAC-SHA256
hmac = matlab.security.HMAC('SHA256');
signatureBytes = hmac.compute(data, obj.encryptionKey);
signature = matlab.net.base64encode(signatureBytes);
end
function isValid = verifySignature(obj, data, signature)
% Verify HMAC-SHA256 signature
expectedSignature = obj.signMessage(data);
isValid = strcmp(expectedSignature, signature);
end
function handleEncryptRequest(obj, message)
% Handle encryption requests
try
data = message.getPayload();
% Convert to bytes if needed
if ischar(data) || isstring(data)
dataBytes = uint8(char(data));
elseif isstruct(data)
dataBytes = uint8(jsonencode(data));
else
dataBytes = uint8(data);
end
% Encrypt
encrypted = obj.encryptData(dataBytes);
% Publish result
result = struct(...
'data', matlab.net.base64encode(encrypted), ...
'algorithm', 'AES-256-CBC' ...
);
obj.messageBus.publish(...
sprintf('crypto/encrypted/%s', message.correlationId), ...
result ...
);
catch ME
obj.logger.error(['Encryption failed: ' ME.message]);
errorResult = struct('error', ME.message);
obj.messageBus.publish(...
sprintf('crypto/error/%s', message.correlationId), ...
errorResult ...
);
end
end
end
end
// LabVIEW Encryption Module Implementation
// File: EncryptionModule.lvclass
// Class Private Data
// - EncryptionKey: Array
// - CipherRef: AES Cipher Reference
// Initialize Method
Begin Initialize
// Load encryption key
KeyBase64 = Config.GetString("Security.EncryptionKey")
EncryptionKey = Base64Decode(KeyBase64)
// Initialize AES cipher
CipherRef = AES.CreateCipher("AES-256-CBC")
// Subscribe to crypto requests
MessageBus.Subscribe("crypto/encrypt/*", HandleEncryptRequest.vi)
MessageBus.Subscribe("crypto/decrypt/*", HandleDecryptRequest.vi)
Logger.Info("Encryption module initialized")
End Initialize
// Encrypt Data Method
// Inputs: PlainData (Array)
// Outputs: EncryptedData (Array)
Begin EncryptData
// Generate random IV (16 bytes)
IV = GenerateRandomBytes(16)
// Set cipher parameters
AES.SetKey(CipherRef, EncryptionKey)
AES.SetIV(CipherRef, IV)
// Pad data to block size
PaddedData = PKCS7Pad(PlainData, 16)
// Encrypt
CipherText = AES.Encrypt(CipherRef, PaddedData)
// Prepend IV to encrypted data
EncryptedData = Concatenate(IV, CipherText)
Return EncryptedData
End EncryptData
// Decrypt Data Method
// Inputs: EncryptedData (Array)
// Outputs: PlainData (Array)
Begin DecryptData
// Extract IV (first 16 bytes)
IV = EncryptedData[0:15]
CipherText = EncryptedData[16:end]
// Set cipher parameters
AES.SetKey(CipherRef, EncryptionKey)
AES.SetIV(CipherRef, IV)
// Decrypt
PaddedPlain = AES.Decrypt(CipherRef, CipherText)
// Remove padding
PlainData = PKCS7Unpad(PaddedPlain)
Return PlainData
End DecryptData
// Encrypt Message Method
// Inputs: Message
// Outputs: EncryptedMessage
Begin EncryptMessage
// Get payload and convert to JSON
Payload = Message.GetPayload()
JsonString = JSONEncode(Payload)
PlainBytes = StringToU8Array(JsonString)
// Encrypt
EncryptedBytes = EncryptData(PlainBytes)
// Create encrypted message
EncryptedMessage.Topic = Message.Topic
EncryptedMessage.Headers = Message.Headers
EncryptedMessage.Headers["X-Encrypted"] = "true"
EncryptedMessage.Headers["X-Encryption-Algorithm"] = "AES-256-CBC"
EncryptedMessage.Payload = Base64Encode(EncryptedBytes)
EncryptedMessage.CorrelationId = Message.CorrelationId
Return EncryptedMessage
End EncryptMessage
// Sign Message Method
// Inputs: Data (Array)
// Outputs: Signature (String)
Begin SignMessage
// Compute HMAC-SHA256
HMAC = ComputeHMAC(Data, EncryptionKey, "SHA256")
// Convert to Base64
Signature = Base64Encode(HMAC)
Return Signature
End SignMessage
// Verify Signature Method
// Inputs: Data (Array), Signature (String)
// Outputs: IsValid (Boolean)
Begin VerifySignature
// Compute expected signature
ExpectedSignature = SignMessage(Data)
// Compare
IsValid = (ExpectedSignature == Signature)
Return IsValid
End VerifySignature
Secrets Management
Securely store and manage sensitive configuration data such as API keys, certificates, and passwords.
Best Practices for Secrets
- Never Hard-code Secrets: Always load from secure configuration or environment variables
- Use Key Vaults: Integrate with enterprise key management systems when available
- Rotate Regularly: Implement automatic key rotation policies
- Audit Access: Log all access to sensitive configuration
- Encrypt at Rest: Store secrets encrypted in configuration files
Secure Configuration Example
# nexus-manifest.yaml
modules:
- name: secure-module
type: process
language: csharp
assembly: SecureModule.dll
configuration:
# Never store secrets in plain text
apiKey: ${SECURE_API_KEY} # From environment
databaseConnection: ${vault://database/connection} # From key vault
encryption:
keyPath: /secure/keys/module.key # Protected file path
algorithm: AES-256-GCM
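At startup the module resolves those references instead of embedding values. A minimal C# sketch (the vault client and its API are assumptions, not SDK types):
// Environment variable referenced as ${SECURE_API_KEY} in the manifest
var apiKey = Environment.GetEnvironmentVariable("SECURE_API_KEY")
    ?? throw new InvalidOperationException("SECURE_API_KEY is not set");

// A ${vault://...} reference would be resolved by your key-vault client, e.g.:
// var connection = await vaultClient.GetSecretAsync("database/connection");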
Security Best Practices
- Defense in Depth: Implement multiple layers of security controls
- Principle of Least Privilege: Grant minimal permissions required
- Input Validation: Always validate and sanitize input data
- Secure Communication: Use TLS for all network communication
- Regular Updates: Keep dependencies and libraries up to date
- Security Testing: Include security tests in your CI/CD pipeline
- Incident Response: Have a plan for security incidents
- Compliance: Follow industry standards (IEC 62443, ISO 27001)
Enterprise Authentication
NEXUS-1 provides enterprise-grade authentication integration, allowing modules to seamlessly work with your organization's existing identity infrastructure.
Authentication Overview
Supported Authentication Providers
NEXUS-1 supports multiple authentication methods to integrate with enterprise identity systems:
Available Authentication Types
- Active Directory (AD) - Native Windows domain authentication
- LDAP - Generic LDAP directory services
- OAuth2/OIDC - Modern authentication (Azure AD, Okta, Auth0)
- SAML 2.0 - Enterprise single sign-on
- Certificate-Based - Smart card and X.509 authentication
- Multi-Factor (MFA) - Additional security layer
Module Authentication Context
When enterprise authentication is enabled, your module receives authenticated user context automatically:
public class SecureModule : ModuleBase
{
protected override async Task OnInitializeAsync()
{
// Access current user context
var user = Context.CurrentUser;
Logger.LogInformation("Module initialized by user: {Username} ({UserId})",
user.Username, user.UserId);
// Check user roles
if (user.IsInRole("Administrator"))
{
EnableAdminFeatures();
}
// Access user attributes from directory
var department = user.Attributes["department"];
var email = user.Email;
// Check specific permissions
if (await user.HasPermissionAsync("data.modify"))
{
EnableDataModification();
}
}
// Authorize operations based on user context
public async Task<bool> ExecuteCriticalOperation(string operation)
{
var user = Context.CurrentUser;
// Require specific role
if (!user.IsInRole("Operator"))
{
Logger.LogWarning("User {User} lacks Operator role for {Operation}",
user.Username, operation);
return false;
}
// Audit the operation
await Context.AuditService.RecordOperationAsync(
user.UserId,
operation,
new { Timestamp = DateTime.UtcNow },
"Critical operation executed");
return true;
}
}
from nexus.sdk import ModuleBase
from datetime import datetime
class SecureModule(ModuleBase):
async def on_initialize(self):
# Access current user context
user = self.context.current_user
self.logger.info(f"Module initialized by user: {user.username} ({user.user_id})")
# Check user roles
if user.is_in_role("Administrator"):
self.enable_admin_features()
# Access user attributes from directory
department = user.attributes.get("department")
email = user.email
# Check specific permissions
if await user.has_permission("data.modify"):
self.enable_data_modification()
# Authorize operations based on user context
async def execute_critical_operation(self, operation: str) -> bool:
user = self.context.current_user
# Require specific role
if not user.is_in_role("Operator"):
self.logger.warning(
f"User {user.username} lacks Operator role for {operation}"
)
return False
# Audit the operation
await self.context.audit_service.record_operation(
user.user_id,
operation,
{"timestamp": datetime.utcnow()},
"Critical operation executed"
)
return True
class SecureModule : public ModuleBase {
protected:
async_task<void> OnInitializeAsync() override {
// Access current user context
auto user = context()->current_user();
logger()->info("Module initialized by user: {} ({})",
user->username(), user->user_id());
// Check user roles
if (user->is_in_role("Administrator")) {
EnableAdminFeatures();
}
// Access user attributes from directory
auto department = user->attributes()["department"];
auto email = user->email();
// Check specific permissions
if (co_await user->has_permission_async("data.modify")) {
EnableDataModification();
}
}
// Authorize operations based on user context
async_task<bool> ExecuteCriticalOperation(const std::string& operation) {
auto user = context()->current_user();
// Require specific role
if (!user->is_in_role("Operator")) {
logger()->warn("User {} lacks Operator role for {}",
user->username(), operation);
co_return false;
}
// Audit the operation
co_await context()->audit_service()->record_operation_async(
user->user_id(),
operation,
json{{"timestamp", std::chrono::system_clock::now()}},
"Critical operation executed");
co_return true;
}
};
Role-Based Access Control (RBAC)
NEXUS-1 maps enterprise directory groups to application roles automatically:
# Example role mapping in nexus-manifest.yaml
authentication:
providers:
- type: active-directory
roleMapping:
"Domain Admins": Administrator
"NEXUS Operators": Operator
"NEXUS Engineers": Engineer
"Quality Control": QualityInspector
"Domain Users": User
Security Best Practices
- Always check user permissions before sensitive operations
- Audit all security-relevant actions
- Use role-based checks rather than individual user checks
- Cache permission checks for performance (they're automatically refreshed); see the sketch after this list
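A module-local cache for frequent checks might look like this C# sketch (IUser stands in for the user type behind Context.CurrentUser; the TTL is arbitrary):
using System;
using System.Collections.Concurrent;
using System.Threading.Tasks;

public class PermissionCache
{
    private readonly ConcurrentDictionary<string, (bool Allowed, DateTime At)> _cache = new();
    private static readonly TimeSpan Ttl = TimeSpan.FromMinutes(1);

    public async Task<bool> CheckAsync(IUser user, string permission)
    {
        var key = $"{user.UserId}:{permission}";
        if (_cache.TryGetValue(key, out var hit) && DateTime.UtcNow - hit.At < Ttl)
            return hit.Allowed;
        var allowed = await user.HasPermissionAsync(permission);
        _cache[key] = (allowed, DateTime.UtcNow);
        return allowed;
    }
}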
Implementing Permission Checks
Declarative Security
Use attributes to declare required permissions on module methods:
public class DataProcessingModule : ModuleBase
{
[RequirePermission("data.read")]
public async Task<Data> ReadDataAsync(string dataId)
{
// Method only executes if user has 'data.read' permission
return await LoadDataAsync(dataId);
}
[RequireRole("Engineer", "Administrator")]
public async Task ModifyConfigurationAsync(Configuration config)
{
// Only Engineers or Administrators can execute this
await UpdateConfigurationAsync(config);
}
[RequireAuthentication]
[AuditOperation("DeleteData")]
public async Task DeleteDataAsync(string dataId, string reason)
{
// Requires any authenticated user and automatically audits
await PerformDeleteAsync(dataId);
}
}
Programmatic Security
For dynamic permission checks based on runtime conditions:
public async Task<bool> ProcessSensitiveData(string dataId)
{
var user = Context.CurrentUser;
var data = await LoadDataAsync(dataId);
// Dynamic permission check based on data classification
string requiredPermission = data.Classification switch
{
"Public" => "data.read",
"Internal" => "data.read.internal",
"Confidential" => "data.read.confidential",
"Secret" => "data.read.secret",
_ => throw new InvalidOperationException($"Unknown classification: {data.Classification}")
};
if (!await user.HasPermissionAsync(requiredPermission))
{
Logger.LogWarning("User {User} lacks permission {Permission} for data {DataId}",
user.Username, requiredPermission, dataId);
// Audit the access denial
await Context.AuditService.RecordOperationAsync(
user.UserId,
"AccessDenied",
new { DataId = dataId, RequiredPermission = requiredPermission },
severity: AuditSeverity.Warning);
return false;
}
// Process the data
await ProcessDataInternalAsync(data);
return true;
}
Session Management
User Session Information
Access detailed session information for security and audit purposes:
public class SessionAwareModule : ModuleBase
{
protected override async Task OnInitializeAsync()
{
var session = Context.CurrentSession;
// Session details
Logger.LogInformation("Session started at: {StartTime}", session.StartTime);
Logger.LogInformation("Authentication method: {Method}", session.AuthenticationMethod);
Logger.LogInformation("Client IP: {IP}", session.ClientIpAddress);
Logger.LogInformation("Workstation: {Workstation}", session.Workstation);
// Check for concurrent sessions
if (session.ConcurrentSessionCount > 1)
{
Logger.LogWarning("User has {Count} concurrent sessions",
session.ConcurrentSessionCount);
}
// Session events
session.OnExpiring += async (sender, args) =>
{
// Notify user of impending session expiration
await NotifySessionExpiringAsync(args.TimeRemaining);
};
session.OnRenewed += (sender, args) =>
{
Logger.LogInformation("Session renewed until: {NewExpiry}", args.NewExpiryTime);
};
}
public async Task<bool> RequireReauthentication(string reason)
{
// Force re-authentication for sensitive operations
var result = await Context.CurrentSession.ReauthenticateAsync(reason);
if (result.Success)
{
await Context.AuditService.RecordOperationAsync(
Context.CurrentUser.UserId,
"Reauthentication",
new { Reason = reason, Success = true },
severity: AuditSeverity.Security);
}
return result.Success;
}
}
Compliance & Audit
NEXUS-1 provides comprehensive audit trail capabilities that meet stringent regulatory requirements including FDA 21 CFR Part 11, ISO 27001, and other compliance standards.
Regulatory Compliance
Supported Standards
- FDA 21 CFR Part 11 - Electronic records and signatures
- ISO 27001 - Information security management
- SOC 2 - Service organization controls
- GDPR - Data protection and privacy
- GxP - Good practices (GMP, GLP, GCP)
- ISO 13485 - Medical devices quality management
- IEC 62304 - Medical device software lifecycle
Audit Trail Requirements
NEXUS-1's audit system ensures:
- Completeness - All relevant operations are recorded
- Accuracy - Timestamps are from trusted sources
- Immutability - Records cannot be modified or deleted
- Attribution - Every action is linked to a user
- Integrity - Cryptographic verification of audit chains
Recording Audit Events
Basic Audit Recording
Record operations with full compliance tracking:
public class QualityControlModule : ModuleBase
{
public async Task<TestResult> ExecuteQualityTest(
string productId,
TestParameters parameters)
{
var user = Context.CurrentUser;
// Record test initiation
var auditRecord = await Context.AuditService.RecordOperationAsync(
userId: user.UserId,
operation: "QualityTest.Started",
data: new
{
ProductId = productId,
TestType = parameters.TestType,
Parameters = parameters,
StartTime = DateTime.UtcNow
},
reason: parameters.Reason,
severity: AuditSeverity.Information
);
try
{
// Execute test
var result = await PerformTestAsync(productId, parameters);
// Record test completion
await Context.AuditService.RecordOperationAsync(
userId: user.UserId,
operation: "QualityTest.Completed",
data: new
{
ProductId = productId,
Result = result,
Duration = result.Duration,
ParentAuditId = auditRecord.Id
},
severity: result.Passed ? AuditSeverity.Information : AuditSeverity.Warning
);
// If test failed, require additional documentation
if (!result.Passed)
{
await RecordTestFailureDetailsAsync(productId, result);
}
return result;
}
catch (Exception ex)
{
// Record test failure
await Context.AuditService.RecordOperationAsync(
userId: user.UserId,
operation: "QualityTest.Failed",
data: new
{
ProductId = productId,
Error = ex.Message,
ParentAuditId = auditRecord.Id
},
severity: AuditSeverity.Error
);
throw;
}
}
private async Task RecordTestFailureDetailsAsync(
string productId,
TestResult result)
{
// Require reason for failure
var reason = await Context.UI.PromptForReasonAsync(
"Test Failure Documentation",
"Please provide detailed reason for test failure:");
if (string.IsNullOrWhiteSpace(reason))
{
throw new ComplianceException(
"Reason required for test failures per 21 CFR Part 11");
}
await Context.AuditService.RecordOperationAsync(
userId: Context.CurrentUser.UserId,
operation: "QualityTest.FailureDocumented",
data: new
{
ProductId = productId,
FailureReason = reason,
FailureDetails = result.FailureDetails
},
reason: reason,
severity: AuditSeverity.Warning
);
}
}
from nexus.sdk import ModuleBase, AuditSeverity
from datetime import datetime
class QualityControlModule(ModuleBase):
async def execute_quality_test(
self,
product_id: str,
parameters: TestParameters) -> TestResult:
user = self.context.current_user
# Record test initiation
audit_record = await self.context.audit_service.record_operation(
user_id=user.user_id,
operation="QualityTest.Started",
data={
"product_id": product_id,
"test_type": parameters.test_type,
"parameters": parameters.to_dict(),
"start_time": datetime.utcnow()
},
reason=parameters.reason,
severity=AuditSeverity.INFORMATION
)
try:
# Execute test
result = await self.perform_test(product_id, parameters)
# Record test completion
await self.context.audit_service.record_operation(
user_id=user.user_id,
operation="QualityTest.Completed",
data={
"product_id": product_id,
"result": result.to_dict(),
"duration": result.duration,
"parent_audit_id": audit_record.id
},
severity=(
AuditSeverity.INFORMATION
if result.passed
else AuditSeverity.WARNING
)
)
# If test failed, require additional documentation
if not result.passed:
await self.record_test_failure_details(product_id, result)
return result
except Exception as ex:
# Record test failure
await self.context.audit_service.record_operation(
user_id=user.user_id,
operation="QualityTest.Failed",
data={
"product_id": product_id,
"error": str(ex),
"parent_audit_id": audit_record.id
},
severity=AuditSeverity.ERROR
)
raise
Electronic Signatures
Implement 21 CFR Part 11 compliant electronic signatures:
public async Task<bool> ApproveTestResults(
string testId,
TestResult results,
ApprovalLevel level)
{
// Require electronic signature for approval
var signatureRequest = new ElectronicSignatureRequest
{
Operation = "TestApproval",
Description = $"Approve test results for Test ID: {testId}",
RequirePassword = true, // 21 CFR Part 11 requirement
RequireReason = true,
MinimumRole = level == ApprovalLevel.Final ? "QualityManager" : "QualityInspector"
};
var signature = await Context.RequestElectronicSignatureAsync(signatureRequest);
if (!signature.IsValid)
{
Logger.LogWarning("Electronic signature failed for test {TestId}", testId);
return false;
}
// Record the approval with signature
var auditRecord = await Context.AuditService.RecordOperationAsync(
userId: signature.UserId,
operation: "TestResults.Approved",
data: new
{
TestId = testId,
Results = results,
ApprovalLevel = level,
SignatureId = signature.Id,
SignatureHash = signature.Hash
},
reason: signature.Reason,
digitalSignature: signature.SignatureData,
severity: AuditSeverity.Information
);
// Store signature reference with test results
results.Approvals.Add(new Approval
{
Level = level,
UserId = signature.UserId,
UserName = signature.UserName,
Timestamp = signature.Timestamp,
SignatureId = signature.Id,
AuditRecordId = auditRecord.Id
});
// For final approval, may require second signature
if (level == ApprovalLevel.Final &&
Context.Configuration.GetValue<bool>("RequireDualApproval"))
{
var secondSignature = await RequestSecondSignatureAsync(testId, results);
if (!secondSignature.IsValid)
{
await RollbackApprovalAsync(auditRecord.Id);
return false;
}
}
return true;
}
Audit Trail Integrity
Blockchain-Style Audit Chain
NEXUS-1 uses cryptographic chaining to ensure audit trail integrity:
// Each audit record contains:
public class AuditRecord
{
public Guid Id { get; set; }
public DateTime Timestamp { get; set; } // Trusted timestamp
public string UserId { get; set; } // Who
public string Operation { get; set; } // What
public string DataHash { get; set; } // SHA-256 of data
public string PreviousRecordHash { get; set; } // Chain link
public string RecordHash { get; set; } // This record's hash
// The hash includes all fields, creating an immutable chain
// Any tampering breaks the chain and is immediately detectable
}
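To make the chaining concrete, here is a sketch of recomputing a record hash and checking one link (the "|"-joined field order is an assumption; the platform defines the actual canonical serialization):
using System;
using System.Security.Cryptography;
using System.Text;

public static class AuditChainVerifier
{
    // Recompute a record's hash from its fields plus the previous record's hash
    public static string ComputeRecordHash(AuditRecord r)
    {
        var canonical = $"{r.Id}|{r.Timestamp:O}|{r.UserId}|{r.Operation}|{r.DataHash}|{r.PreviousRecordHash}";
        using var sha = SHA256.Create();
        return Convert.ToBase64String(sha.ComputeHash(Encoding.UTF8.GetBytes(canonical)));
    }

    // A link is intact if the stored hash matches the recomputed value and
    // points at the predecessor's RecordHash; any edit breaks every later link
    public static bool VerifyLink(AuditRecord previous, AuditRecord current) =>
        current.PreviousRecordHash == previous.RecordHash &&
        current.RecordHash == ComputeRecordHash(current);
}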
Verifying Audit Integrity
Modules can verify audit trail integrity for compliance reports:
public async Task<ComplianceReport> GenerateComplianceReportAsync(
DateTime startDate,
DateTime endDate)
{
// Verify audit trail integrity first
var integrityResult = await Context.AuditService.VerifyAuditIntegrityAsync(
startDate, endDate);
if (!integrityResult.IsValid)
{
throw new ComplianceException(
$"Audit trail integrity check failed: {integrityResult.Issues.Count} issues found");
}
// Generate compliance report
var report = await Context.AuditService.GenerateComplianceReportAsync(
ComplianceStandard.FDA_21CFR11,
startDate,
endDate);
// Add module-specific compliance data
report.Sections.Add("QualityTests", new ComplianceSection
{
Title = "Quality Test Compliance",
Metrics = new Dictionary<string, object>
{
["TotalTests"] = await CountTestsAsync(startDate, endDate),
["SignedResults"] = await CountSignedResultsAsync(startDate, endDate),
["FailureRate"] = await CalculateFailureRateAsync(startDate, endDate),
["AverageApprovalTime"] = await CalculateAvgApprovalTimeAsync(startDate, endDate)
},
ComplianceStatus = ComplianceStatus.Compliant
});
return report;
}
Data Retention and Export
Compliance-Ready Export
Export audit trails in regulatory-compliant formats:
public async Task<ExportResult> ExportAuditTrailAsync(ExportRequest request)
{
// Ensure user has permission to export audit data
if (!await Context.CurrentUser.HasPermissionAsync("audit.export"))
{
throw new SecurityException("Insufficient permissions for audit export");
}
// Record the export operation itself
await Context.AuditService.RecordOperationAsync(
Context.CurrentUser.UserId,
"AuditTrail.Exported",
new
{
DateRange = new { request.StartDate, request.EndDate },
Format = request.Format,
Purpose = request.Purpose
},
reason: request.Reason,
severity: AuditSeverity.Security
);
// Export in requested format
return await Context.AuditService.ExportAuditRecordsAsync(
request.StartDate,
request.EndDate,
request.Format switch
{
"FDA" => ExportFormat.FDA_XML,
"ISO" => ExportFormat.ISO_27001_Report,
"PDF" => ExportFormat.PDF,
_ => ExportFormat.JSON
}
);
}
Important Compliance Notes
- Audit records are immutable - they cannot be modified or deleted
- All timestamps use trusted time sources (NTP synchronized)
- Electronic signatures require user authentication at time of signing
- Audit trails are retained according to configured retention policies
- Export operations are themselves audited for compliance
Digital Signatures
Implement cryptographic signatures to ensure data integrity, authenticity, and non-repudiation for critical operations and test results.
Digital Signature Overview
Why Digital Signatures?
Digital signatures provide:
- Authenticity - Proves who signed the data
- Integrity - Detects any tampering with signed data
- Non-repudiation - Signer cannot deny having signed
- Compliance - Meets regulatory requirements for electronic records
- Legal validity - Legally binding in most jurisdictions
Signature Algorithms
NEXUS-1 supports industry-standard signature algorithms:
- RSA-SHA256/384/512 - RSA signatures with SHA-2 hashing
- ECDSA-SHA256/384/512 - Elliptic curve signatures
- RSA-PSS - Probabilistic signature scheme
Signing Data
Basic Data Signing
Sign any data object with your module's certificate or user certificate:
public class TestResultModule : ModuleBase
{
public async Task<SignedTestResult> GenerateSignedTestResultAsync(
string productId,
TestData testData)
{
// Perform the test
var result = await ExecuteTestAsync(productId, testData);
// Create test result object
var testResult = new TestResult
{
Id = Guid.NewGuid(),
ProductId = productId,
TestDate = DateTime.UtcNow,
TestType = testData.TestType,
Results = result,
PassFail = result.MeetsSpecification ? "PASS" : "FAIL",
TestedBy = Context.CurrentUser.Username,
ModuleId = ModuleInfo.Id,
ModuleVersion = ModuleInfo.Version
};
// Sign the test result
var signedResult = await Context.DigitalSignatureService.SignDataAsync(
testResult,
Context.ModuleCertificate, // Use module's certificate
SignatureAlgorithm.RSA_SHA256,
new SignatureOptions
{
IncludeTimestamp = true,
IncludeCertificateChain = true,
TsaUrl = "https://timestamp.digicert.com"
}
);
// Log the signing operation
Logger.LogInformation("Test result {TestId} signed with certificate {Thumbprint}",
testResult.Id, Context.ModuleCertificate.Thumbprint);
// Store signed result
await StoreSignedResultAsync(signedResult);
return new SignedTestResult
{
TestResult = testResult,
Signature = signedResult.Signature,
Certificate = signedResult.SignerCertificate,
Timestamp = signedResult.TimeStamp
};
}
// Sign with user's personal certificate
public async Task<SignedData<Approval>> ApproveWithSignatureAsync(
string documentId,
ApprovalDecision decision)
{
// Prompt user to select their signing certificate
var certificate = await Context.UI.SelectCertificateAsync(
"Select your signing certificate",
StoreName.My,
StoreLocation.CurrentUser,
X509FindType.FindByKeyUsage,
X509KeyUsageFlags.DigitalSignature
);
if (certificate == null)
{
throw new OperationCanceledException("Certificate selection cancelled");
}
var approval = new Approval
{
DocumentId = documentId,
Decision = decision,
ApprovedBy = Context.CurrentUser.UserId,
ApprovedAt = DateTime.UtcNow,
Comments = decision.Comments
};
// Sign with user's certificate
var signedApproval = await Context.DigitalSignatureService.SignDataAsync(
approval,
certificate,
SignatureAlgorithm.RSA_SHA256
);
return signedApproval;
}
}
from nexus.sdk import ModuleBase, SignatureAlgorithm, SignatureOptions
from datetime import datetime
import uuid
class TestResultModule(ModuleBase):
async def generate_signed_test_result(
self,
product_id: str,
test_data: TestData) -> SignedTestResult:
# Perform the test
result = await self.execute_test(product_id, test_data)
# Create test result object
test_result = TestResult(
id=str(uuid.uuid4()),
product_id=product_id,
test_date=datetime.utcnow(),
test_type=test_data.test_type,
results=result,
pass_fail="PASS" if result.meets_specification else "FAIL",
tested_by=self.context.current_user.username,
module_id=self.module_info.id,
module_version=self.module_info.version
)
# Sign the test result
signed_result = await self.context.digital_signature_service.sign_data(
test_result,
self.context.module_certificate, # Use module's certificate
SignatureAlgorithm.RSA_SHA256,
SignatureOptions(
include_timestamp=True,
include_certificate_chain=True,
tsa_url="https://timestamp.digicert.com"
)
)
# Log the signing operation
self.logger.info(
f"Test result {test_result.id} signed with "
f"certificate {self.context.module_certificate.thumbprint}"
)
# Store signed result
await self.store_signed_result(signed_result)
return SignedTestResult(
test_result=test_result,
signature=signed_result.signature,
certificate=signed_result.signer_certificate,
timestamp=signed_result.timestamp
)
Batch Signing
Sign multiple records efficiently with batch operations:
public async Task<List<SignedData<TestResult>>> SignBatchResultsAsync(
List<TestResult> results)
{
var signedResults = new List<SignedData<TestResult>>();
// Use a single signing session for efficiency
using (var signingSession = await Context.DigitalSignatureService
.CreateSigningSessionAsync(Context.ModuleCertificate))
{
foreach (var result in results)
{
var signed = await signingSession.SignDataAsync(result);
signedResults.Add(signed);
// Update progress
await Context.UI.UpdateProgressAsync(
$"Signing results... {signedResults.Count}/{results.Count}");
}
}
// Create a manifest of all signed results
var manifest = new BatchSignatureManifest
{
BatchId = Guid.NewGuid(),
SignedAt = DateTime.UtcNow,
TotalRecords = signedResults.Count,
SignatureHashes = signedResults.Select(s => s.Signature).ToList()
};
// Sign the manifest itself (store it alongside the batch for later verification)
var signedManifest = await Context.DigitalSignatureService.SignDataAsync(
manifest,
Context.ModuleCertificate
);
return signedResults;
}
Verifying Signatures
Signature Verification
Verify signatures to ensure data hasn't been tampered with:
public async Task<bool> ProcessSignedDataAsync(SignedData<TestResult> signedData)
{
// Verify the signature
var verificationResult = await Context.DigitalSignatureService
.VerifySignatureAsync(signedData);
if (!verificationResult.IsValid)
{
Logger.LogWarning("Invalid signature detected: {Errors}",
string.Join(", ", verificationResult.ValidationErrors));
// Audit the verification failure
await Context.AuditService.RecordOperationAsync(
Context.CurrentUser.UserId,
"Signature.VerificationFailed",
new
{
DataType = typeof(TestResult).Name,
Errors = verificationResult.ValidationErrors,
SignerCertificate = verificationResult.SignerCertificateSubject
},
severity: AuditSeverity.Security
);
return false;
}
// Check certificate validity
if (!verificationResult.IsCertificateValid)
{
Logger.LogWarning("Certificate validation failed for signer: {Subject}",
verificationResult.SignerCertificateSubject);
return false;
}
// Verify timestamp if present
if (signedData.TimeStamp != null && !verificationResult.IsTimeStampValid)
{
Logger.LogWarning("Timestamp validation failed");
return false;
}
// Process the verified data
Logger.LogInformation("Successfully verified signature from {Signer} at {Time}",
verificationResult.SignerName,
verificationResult.SignedAt);
await ProcessVerifiedDataAsync(signedData.Data);
return true;
}
Chain of Custody
Maintain a cryptographic chain of custody for critical data:
public async Task<ChainOfCustody> TransferCustodyAsync(
string dataId,
string recipientId,
string reason)
{
// Get current chain
var currentChain = await LoadChainOfCustodyAsync(dataId);
// Create new link in the chain
var newChain = await Context.DigitalSignatureService.CreateChainOfCustodyAsync(
data: new CustodyTransfer
{
DataId = dataId,
FromUserId = Context.CurrentUser.UserId,
ToUserId = recipientId,
Reason = reason,
TransferTime = DateTime.UtcNow
},
userId: Context.CurrentUser.UserId,
operation: "CustodyTransfer",
previousChain: currentChain
);
// Verify the entire chain
var chainVerification = await Context.DigitalSignatureService
.VerifyChainOfCustodyAsync(newChain);
if (!chainVerification.IsValid)
{
throw new SecurityException("Chain of custody verification failed");
}
// Store the new chain
await StoreChainOfCustodyAsync(newChain);
Logger.LogInformation("Custody transferred from {From} to {To} for data {DataId}",
Context.CurrentUser.UserId, recipientId, dataId);
return newChain;
}
Certificate Management
Working with Certificates
Access and manage certificates for signing operations:
public class CertificateManagementModule : ModuleBase
{
public async Task ConfigureSigningCertificateAsync()
{
// List available certificates
var certificates = await Context.CertificateStore
.FindCertificatesAsync(
StoreName.My,
StoreLocation.CurrentUser,
X509FindType.FindByKeyUsage,
X509KeyUsageFlags.DigitalSignature
);
// Let user select certificate
var selected = await Context.UI.SelectFromListAsync(
"Select Signing Certificate",
certificates.Select(c => new
{
Certificate = c,
Display = $"{c.Subject} (Expires: {c.NotAfter:yyyy-MM-dd})"
}).ToList(),
c => c.Display
);
if (selected != null)
{
// Validate certificate
var validation = await Context.CertificateValidator
.ValidateCertificateAsync(selected.Certificate);
if (!validation.IsValid)
{
throw new SecurityException(
$"Certificate validation failed: {string.Join(", ", validation.ValidationErrors)}");
}
// Store certificate reference
await Context.Configuration.SetAsync(
"SigningCertificate.Thumbprint",
selected.Certificate.Thumbprint
);
Logger.LogInformation("Signing certificate configured: {Subject}",
selected.Certificate.Subject);
}
}
// Monitor certificate expiration
protected override async Task OnInitializeAsync()
{
var thumbprint = Context.Configuration.GetValue<string>("SigningCertificate.Thumbprint");
if (!string.IsNullOrEmpty(thumbprint))
{
var cert = await Context.CertificateStore.FindByThumbprintAsync(thumbprint);
if (cert != null)
{
var daysToExpiry = (cert.NotAfter - DateTime.Now).TotalDays;
if (daysToExpiry < 30)
{
Logger.LogWarning("Signing certificate expires in {Days} days", daysToExpiry);
await Context.Notifications.SendAsync(
"Certificate Expiration Warning",
$"Your signing certificate expires on {cert.NotAfter:yyyy-MM-dd}"
);
}
}
}
}
}
Digital Signature Best Practices
- Always include timestamps from trusted TSA for long-term validity
- Store certificates in secure hardware (HSM) when possible
- Monitor certificate expiration and renew before expiry
- Use appropriate key lengths (RSA 2048+ or ECDSA P-256+)
- Archive signed data with signatures for long-term verification
Module Signing
NEXUS-1 supports code signing for modules to ensure authenticity and prevent tampering. In production environments, only properly signed modules from trusted sources can be loaded.
Code Signing Overview
Why Sign Modules?
- Security - Prevents loading of malicious or tampered modules
- Authenticity - Verifies module comes from trusted developer
- Integrity - Ensures module hasn't been modified since signing
- Compliance - Many regulations require code signing
- Trust - Users can verify module publisher
Signing Requirements
For production deployments:
- Modules must be signed with a valid code signing certificate
- Certificate must be issued by a trusted Certificate Authority
- Certificate must have Code Signing key usage
- Timestamp should be included for long-term validity
Signing Your Module
Obtaining a Code Signing Certificate
You'll need a code signing certificate from a trusted CA:
- Purchase from a Certificate Authority (DigiCert, Sectigo, etc.)
- Or use your organization's internal CA for private deployments
- Store certificate securely (HSM recommended for production)
Signing Process
# Using SignTool (Windows SDK)
# Sign a .NET module DLL
signtool sign /fd SHA256 /tr http://timestamp.digicert.com /td SHA256 ^
/f "MyCertificate.pfx" /p "CertificatePassword" ^
/d "My NEXUS Module" ^
"bin\Release\MyModule.dll"
# Sign with certificate from store
signtool sign /fd SHA256 /tr http://timestamp.digicert.com /td SHA256 ^
/n "My Company Name" /sm ^
/d "My NEXUS Module" ^
"bin\Release\MyModule.dll"
# Sign with EV certificate on hardware token
signtool sign /fd SHA256 /tr http://timestamp.digicert.com /td SHA256 ^
/sha1 "0123456789ABCDEF0123456789ABCDEF01234567" ^
/d "My NEXUS Module" ^
"bin\Release\MyModule.dll"
# Verify signature
signtool verify /pa /v "bin\Release\MyModule.dll"
# Using osslsigncode for .NET assemblies on Linux
# Sign a module DLL
osslsigncode sign \
-pkcs12 MyCertificate.pfx \
-pass CertificatePassword \
-n "My NEXUS Module" \
-t http://timestamp.digicert.com \
-in bin/Release/MyModule.dll \
-out bin/Release/MyModule.signed.dll
# Move signed file to original name
mv bin/Release/MyModule.signed.dll bin/Release/MyModule.dll
# Using .NET CLI with SignTool task
dotnet msbuild MyModule.csproj /t:SignFile \
/p:CertificateThumbprint=0123456789ABCDEF \
/p:TimestampUrl=http://timestamp.digicert.com
# Verify signature
osslsigncode verify bin/Release/MyModule.dll
# GitHub Actions example
name: Build and Sign Module
on: [push, pull_request]
jobs:
build-and-sign:
runs-on: windows-latest
steps:
- uses: actions/checkout@v3
- name: Setup .NET
uses: actions/setup-dotnet@v3
with:
dotnet-version: 9.0.x
- name: Build Module
run: dotnet build -c Release
- name: Import Code Signing Certificate
env:
CERTIFICATE_BASE64: ${{ secrets.CODE_SIGNING_CERT }}
CERTIFICATE_PASSWORD: ${{ secrets.CODE_SIGNING_CERT_PASSWORD }}
run: |
$cert = [Convert]::FromBase64String($env:CERTIFICATE_BASE64)
Set-Content -Path certificate.pfx -Value $cert -Encoding Byte
- name: Sign Module
run: |
& "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64\signtool.exe" `
sign /fd SHA256 /tr http://timestamp.digicert.com /td SHA256 `
/f certificate.pfx /p $env:CERTIFICATE_PASSWORD `
/d "My NEXUS Module" `
"bin\Release\net9.0\MyModule.dll"
- name: Verify Signature
run: |
& "C:\Program Files (x86)\Windows Kits\10\bin\10.0.22621.0\x64\signtool.exe" `
verify /pa /v "bin\Release\net9.0\MyModule.dll"
- name: Cleanup Certificate
if: always()
run: Remove-Item -Path certificate.pfx -Force
MSBuild Integration
Automate signing in your project file:
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net9.0</TargetFramework>
<SignAssembly>true</SignAssembly>
</PropertyGroup>
<!-- Code signing configuration -->
<PropertyGroup Condition="'$(Configuration)' == 'Release'">
<SignTool>signtool</SignTool>
<SigningCertificate>$(SIGNING_CERTIFICATE_THUMBPRINT)</SigningCertificate>
<TimestampUrl>http://timestamp.digicert.com</TimestampUrl>
</PropertyGroup>
<!-- Sign after build -->
<Target Name="SignModule" AfterTargets="Build" Condition="'$(Configuration)' == 'Release'">
<Exec Command="$(SignTool) sign /fd SHA256 /tr $(TimestampUrl) /td SHA256 /sha1 $(SigningCertificate) /sm /d &quot;$(AssemblyName)&quot; &quot;$(TargetPath)&quot;" />
<Message Text="Module signed successfully" Importance="high" />
</Target>
</Project>
Module Signature Verification
How NEXUS-1 Verifies Modules
When loading a module, NEXUS-1 performs these checks:
- Verifies the digital signature is valid
- Checks certificate chain to trusted root
- Validates certificate hasn't expired
- Checks certificate revocation status
- Verifies timestamp (if present)
- Ensures certificate has Code Signing usage
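Conceptually, the certificate-related checks map onto standard X.509 APIs. The following sketch uses .NET's certificate classes to illustrate them (CreateFromSignedFile is Windows-only; this is not the host's actual verification code, and it omits the Authenticode file-hash check):
using System;
using System.Linq;
using System.Security.Cryptography.X509Certificates;

// Extract the signing certificate embedded in the signed file
var signer = new X509Certificate2(
    X509Certificate.CreateFromSignedFile("MyModule.dll"));

// Build the chain to a trusted root, checking expiry and revocation
using var chain = new X509Chain();
chain.ChainPolicy.RevocationMode = X509RevocationMode.Online;
bool chainOk = chain.Build(signer);

// Confirm the certificate carries the Code Signing enhanced key usage
bool codeSigning = signer.Extensions
    .OfType<X509EnhancedKeyUsageExtension>()
    .Any(eku => eku.EnhancedKeyUsages.Cast<Oid>()
        .Any(oid => oid.Value == "1.3.6.1.5.5.7.3.3"));

Console.WriteLine(chainOk && codeSigning
    ? "Certificate checks passed"
    : "Certificate checks failed");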
Module Manifest Configuration
Specify signature requirements in your manifest:
modules:
- id: "critical-control-module"
name: "Critical Control Module"
language: "csharp"
assembly: "modules/CriticalControl.dll"
# Signature requirements
signature:
required: true
thumbprint: "0123456789ABCDEF0123456789ABCDEF01234567" # Optional: specific cert
issuer: "CN=My Company Code Signing CA" # Optional: issuer requirement
# Module will only load if signature validation passes
Testing Unsigned Modules
For development and testing, you can disable signature verification:
⚠️ Development Only
Never disable signature verification in production!
# Development configuration
security:
certificates:
requireSignedModules: false # DEVELOPMENT ONLY!
# Or via environment variable
NEXUS_SECURITY__CERTIFICATES__REQUIRESIGNEDMODULES=false dotnet Nexus.Host.dll
Troubleshooting Signature Issues
Common Issues and Solutions
Module fails to load with "Invalid signature"
Causes:
- Module was modified after signing
- Signature corrupted during transfer
- Wrong signature algorithm used
Solution: Re-sign the module and verify locally before deployment
Certificate validation fails
Causes:
- Certificate not from trusted CA
- Certificate expired
- Missing intermediate certificates
Solution: Ensure full certificate chain is included when signing
Revocation check fails
Causes:
- No internet connection for OCSP/CRL check
- Certificate revoked
- OCSP responder down
Solution: Configure offline revocation checking or proxy settings
Verification Tools
Use NEXUS-1 tools to verify module signatures:
# Verify module signature using NEXUS CLI
nexus verify-module MyModule.dll
# Output:
Module: MyModule.dll
Signature: Valid
Signer: My Company, Inc.
Certificate: CN=My Company Code Signing, O=My Company, Inc.
Issuer: CN=DigiCert SHA2 Assured ID Code Signing CA
Valid From: 2024-01-01 00:00:00 UTC
Valid To: 2027-01-01 23:59:59 UTC
Timestamp: 2024-06-15 10:30:45 UTC (DigiCert Timestamp 2023)
Hash Algorithm: SHA256
File Hash: 3B4C5D6E7F8A9B0C1D2E3F4A5B6C7D8E9F0A1B2C3D4E5F6A7B8C9D0E1F2A3B4C
✓ Module signature is valid and can be loaded by NEXUS-1
Best Practices for Module Signing
- Always timestamp your signatures for long-term validity
- Use SHA256 or stronger hash algorithms
- Store signing certificates in hardware security modules (HSM)
- Implement signing as part of your CI/CD pipeline
- Keep a backup of your signing certificates (securely!)
- Monitor certificate expiration dates
- Test signed modules in a staging environment first
Module Licensing
Build commercial modules with flexible licensing capabilities using the NEXUS-1 Licensing SDK. This enables you to monetize your modules while providing customers with various licensing models and feature tiers.
Important: NEXUS-1 Core is Free
NEXUS-1 itself is free to use. The licensing capabilities described here are for YOUR commercial modules built on top of NEXUS-1.
Quick Start - Licensed Module
using Nexus.Licensing;
[Module("your-product", "1.0.0")]
public class YourCommercialModule : LicensedModuleBase
{
// Define your product ID for licensing
protected override string ProductId => "YOUR_PRODUCT_ID";
protected override async Task OnLicensedInitializeAsync()
{
if (!IsLicensed)
{
Logger.LogWarning("Running in demo mode with limited features");
}
Logger.LogInformation($"License Status: {LicenseTier}");
}
public async Task ProcessData()
{
// Basic features (always available)
var result = BasicProcessing();
// Premium features (require license)
if (await IsFeatureEnabledAsync("ADVANCED_ANALYTICS"))
{
result = await AdvancedProcessing(result);
}
// Enterprise features (require specific tier)
if (LicenseTier >= LicenseTier.Enterprise)
{
await EnterpriseFeatures(result);
}
// Track usage for metered licensing
await TrackUsageAsync("data_processed", 1);
}
}
from nexus_sdk import LicensedModuleBase, LicenseTier
class YourCommercialModule(LicensedModuleBase):
"""Commercial module with licensing support"""
@property
def product_id(self) -> str:
return "YOUR_PRODUCT_ID"
async def on_licensed_initialize(self):
if not self.is_licensed:
self.logger.warning("Running in demo mode with limited features")
self.logger.info(f"License Status: {self.license_tier}")
async def process_data(self):
# Basic features (always available)
result = self.basic_processing()
# Premium features (require license)
if await self.is_feature_enabled("ADVANCED_ANALYTICS"):
result = await self.advanced_processing(result)
# Enterprise features (require specific tier)
if self.license_tier >= LicenseTier.ENTERPRISE:
await self.enterprise_features(result)
# Track usage for metered licensing
await self.track_usage("data_processed", 1)
#include <nexus/licensing.hpp> // header name assumed
class YourCommercialModule : public nexus::LicensedModuleBase {
protected:
std::string GetProductId() const override {
return "YOUR_PRODUCT_ID";
}
nexus::Task<void> OnLicensedInitializeAsync() override { // coroutine task type assumed
if (!IsLicensed()) {
Logger()->Warn("Running in demo mode with limited features");
}
Logger()->Info("License Status: {}", GetLicenseTier());
}
public:
nexus::Task<void> ProcessData() {
// Basic features (always available)
auto result = BasicProcessing();
// Premium features (require license)
if (co_await IsFeatureEnabledAsync("ADVANCED_ANALYTICS")) {
result = co_await AdvancedProcessing(result);
}
// Enterprise features (require specific tier)
if (GetLicenseTier() >= LicenseTier::Enterprise) {
co_await EnterpriseFeatures(result);
}
// Track usage for metered licensing
co_await TrackUsageAsync("data_processed", 1);
}
};
classdef YourCommercialModule < nexus.LicensedModuleBase
properties (Constant)
ProductId = 'YOUR_PRODUCT_ID'
end
methods
function onLicensedInitialize(obj)
if ~obj.IsLicensed
obj.Logger.warning('Running in demo mode with limited features');
end
obj.Logger.info(['License Status: ' char(obj.LicenseTier)]);
end
function processData(obj)
% Basic features (always available)
result = obj.basicProcessing();
% Premium features (require license)
if obj.isFeatureEnabled('ADVANCED_ANALYTICS')
result = obj.advancedProcessing(result);
end
% Enterprise features (require specific tier)
if obj.LicenseTier >= nexus.LicenseTier.Enterprise
obj.enterpriseFeatures(result);
end
% Track usage for metered licensing
obj.trackUsage('data_processed', 1);
end
end
end
// LabVIEW Implementation: YourCommercialModule.lvclass
// Inherits from: Nexus.LicensedModuleBase.lvclass
// ProductId Property VI:
// - Return constant string: "YOUR_PRODUCT_ID"
// OnLicensedInitialize.vi:
// 1. Check IsLicensed property
// 2. If FALSE:
// - Log warning: "Running in demo mode"
// 3. Get LicenseTier property
// 4. Log info with tier status
// ProcessData.vi:
// 1. Basic Processing (always runs)
// 2. Check Feature "ADVANCED_ANALYTICS":
// - If enabled: Run Advanced Processing
// 3. Check LicenseTier >= Enterprise:
// - If true: Run Enterprise Features
// 4. Track Usage:
// - Metric: "data_processed"
// - Value: 1
Licensing Architecture
How It Works
The licensing system consists of three main components:
- Licensing Module - Core infrastructure that validates licenses
- Your Licensed Module - Inherits from LicensedModuleBase
- License Service - Local files or cloud API for validation
Deployment Configuration
# Customer's manifest.yaml
modules:
# 1. Deploy the licensing infrastructure
- id: licensing-service
module: "@nexus/licensing"
config:
service_type: cloud # or 'local' for offline
api_url: https://api.yourcompany.com/licensing
# 2. Deploy your commercial module
- id: your-product
module: "@yourcompany/product"
requires: [licensing-service]
config:
license_key: "CUSTOMER-LICENSE-KEY-HERE"
License Types and Tiers
License Types
Type | Description | Use Case |
---|---|---|
Trial | Time-limited evaluation | 30-day trials, POCs |
Subscription | Recurring payment model | Monthly/annual licenses |
Perpetual | One-time purchase | Traditional licensing |
Usage | Pay per use | API calls, data volume |
Developer | Development use only | Non-production systems |
Enterprise | Site-wide deployment | Large organizations |
License Tiers
Tier | Typical Features | Target Audience |
---|---|---|
Community | Basic features only | Free users, evaluation |
Standard | Core professional features | Small teams |
Professional | Advanced features, priority support | Professional teams |
Enterprise | All features, SLA, custom support | Large organizations |
Ultimate | Custom features, white-label | Strategic partners |
Feature Gating
Controlling Feature Access
// Method 1: Check individual features
if (await IsFeatureEnabledAsync("REAL_TIME_ANALYTICS"))
{
EnableRealTimeProcessing();
}
// Method 2: Require feature or throw
await RequireFeatureAsync("ML_PREDICTIONS");
// Throws if feature not available
// Method 3: Check license tier
if (LicenseTier >= LicenseTier.Professional)
{
EnableProfessionalFeatures();
}
// Method 4: Require minimum tier
RequireTier(LicenseTier.Enterprise);
// Throws if tier requirement not met
// Method 5: Graceful degradation
var features = new List<string>();
if (await IsFeatureEnabledAsync("FEATURE_A")) features.Add("A");
if (await IsFeatureEnabledAsync("FEATURE_B")) features.Add("B");
if (await IsFeatureEnabledAsync("FEATURE_C")) features.Add("C");
Logger.LogInformation($"Enabled features: {string.Join(", ", features)}");
# Method 1: Check individual features
if await self.is_feature_enabled("REAL_TIME_ANALYTICS"):
self.enable_real_time_processing()
# Method 2: Require feature or throw
await self.require_feature("ML_PREDICTIONS")
# Raises exception if feature not available
# Method 3: Check license tier
if self.license_tier >= LicenseTier.PROFESSIONAL:
self.enable_professional_features()
# Method 4: Require minimum tier
self.require_tier(LicenseTier.ENTERPRISE)
# Raises exception if tier requirement not met
# Method 5: Graceful degradation
features = []
if await self.is_feature_enabled("FEATURE_A"): features.append("A")
if await self.is_feature_enabled("FEATURE_B"): features.append("B")
if await self.is_feature_enabled("FEATURE_C"): features.append("C")
self.logger.info(f"Enabled features: {', '.join(features)}")
Feature Definition Example
// Define your product's features
public static class ProductFeatures
{
// Basic tier features
public const string BASIC_PROCESSING = "BASIC_PROCESSING";
public const string DATA_IMPORT = "DATA_IMPORT";
public const string STANDARD_REPORTS = "STANDARD_REPORTS";
// Professional tier features
public const string ADVANCED_ANALYTICS = "ADVANCED_ANALYTICS";
public const string CUSTOM_DASHBOARDS = "CUSTOM_DASHBOARDS";
public const string API_ACCESS = "API_ACCESS";
public const string PRIORITY_SUPPORT = "PRIORITY_SUPPORT";
// Enterprise tier features
public const string ML_PREDICTIONS = "ML_PREDICTIONS";
public const string WHITE_LABEL = "WHITE_LABEL";
public const string UNLIMITED_USERS = "UNLIMITED_USERS";
public const string CUSTOM_INTEGRATIONS = "CUSTOM_INTEGRATIONS";
}
Usage Tracking
Implementing Usage-Based Licensing
// Track different usage metrics
public async Task ProcessApiRequest(Request request)
{
// Track API calls
var result = await TrackUsageAsync("api_calls", 1);
if (!result.IsWithinLimit)
{
throw new UsageLimitException(
$"API call limit reached: {result.CurrentUsage}/{result.UsageLimit}");
}
// Process the request
var response = await ProcessRequest(request);
// Track data volume
var dataSize = response.DataSize / 1_000_000; // MB
await TrackUsageAsync("data_volume_mb", dataSize);
// Track compute time
var computeTime = response.ProcessingTime.TotalHours;
await TrackUsageAsync("compute_hours", computeTime,
new Dictionary<string, object>
{
["operation"] = request.Operation,
["complexity"] = request.Complexity
});
}
// Handle usage limit exceeded
protected override async Task OnUsageLimitExceededAsync(
string metricName, double currentUsage, double limit)
{
Logger.LogWarning($"Usage limit hit: {metricName}");
// Notify user
await Messages.PublishAsync("usage.limit.exceeded", new
{
Metric = metricName,
Current = currentUsage,
Limit = limit,
Message = "Please upgrade your license for higher limits"
});
// Optional: Degrade gracefully
if (metricName == "api_calls")
{
EnableRateLimiting();
}
}
# Track different usage metrics
async def process_api_request(self, request):
# Track API calls
result = await self.track_usage("api_calls", 1)
if not result.is_within_limit:
raise UsageLimitException(
f"API call limit reached: {result.current_usage}/{result.usage_limit}")
# Process the request
response = await self.process_request(request)
# Track data volume
data_size = response.data_size / 1_000_000 # MB
await self.track_usage("data_volume_mb", data_size)
# Track compute time
compute_time = response.processing_time.total_seconds() / 3600
await self.track_usage("compute_hours", compute_time,
context={
"operation": request.operation,
"complexity": request.complexity
})
# Handle usage limit exceeded
async def on_usage_limit_exceeded(self, metric_name, current_usage, limit):
self.logger.warning(f"Usage limit hit: {metric_name}")
# Notify user
await self.publish("usage.limit.exceeded", {
"metric": metric_name,
"current": current_usage,
"limit": limit,
"message": "Please upgrade your license for higher limits"
})
# Optional: Degrade gracefully
if metric_name == "api_calls":
self.enable_rate_limiting()
Common Usage Metrics
Metric | Unit | Example Limits |
---|---|---|
api_calls | Count per month | 10K, 100K, 1M, Unlimited |
data_points | Count per month | 100K, 10M, 100M, Unlimited |
data_volume_gb | GB per month | 10, 100, 1000, Unlimited |
compute_hours | Hours per month | 100, 1000, 10000, Unlimited |
concurrent_users | Simultaneous users | 5, 25, 100, Unlimited |
devices | Connected devices | 10, 100, 1000, Unlimited |
License Validation
Local License Files
For offline deployments, use local license files:
{
"ProductId": "YOUR_PRODUCT",
"ProductName": "Your Product Name",
"LicenseKey": "PROD-XXXX-XXXX-XXXX",
"LicenseHolder": "Customer Name",
"Company": "Customer Company",
"IssueDate": "2024-01-01T00:00:00Z",
"ExpirationDate": "2025-01-01T00:00:00Z",
"Type": "Subscription",
"Tier": "Professional",
"Features": [
"ADVANCED_ANALYTICS",
"API_ACCESS",
"PRIORITY_SUPPORT"
],
"UsageLimits": {
"api_calls": 100000,
"data_points": 10000000,
"concurrent_users": 25
}
}
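A minimal sketch of loading and checking such a file (LocalLicense is an assumed POCO mirroring the JSON above; a production loader would also verify a cryptographic signature on the file before trusting its contents):
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.Json;

var license = JsonSerializer.Deserialize<LocalLicense>(
    File.ReadAllText("license.json"))!;

// Reject expired licenses up front
if (license.ExpirationDate < DateTime.UtcNow)
    throw new InvalidOperationException("License expired");

// Feature and limit lookups drive the gating shown earlier
bool analyticsEnabled = license.Features.Contains("ADVANCED_ANALYTICS");
long apiCallLimit = license.UsageLimits["api_calls"];

public record LocalLicense(
    string ProductId,
    string LicenseKey,
    DateTime ExpirationDate,
    string Type,
    string Tier,
    List<string> Features,
    Dictionary<string, long> UsageLimits);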
Cloud License Validation
For online deployments, implement these REST endpoints:
# Your licensing API endpoints
POST /v1/licenses/validate # Validate license key
POST /v1/usage/track # Track usage metrics
GET /v1/usage/statistics # Get usage stats
GET /v1/features/check # Check feature availability
GET /v1/licenses/metadata # Get license details
# Example validation request
POST /v1/licenses/validate
{
"ProductId": "YOUR_PRODUCT",
"LicenseKey": "CUST-LICENSE-KEY",
"ModuleId": "module-instance-id",
"Version": "1.0.0",
"EnvironmentInfo": {
"hostname": "customer-server",
"os": "Windows Server 2022"
}
}
# Example response
{
"IsValid": true,
"Type": "Subscription",
"Tier": "Professional",
"ExpirationDate": "2025-01-01T00:00:00Z",
"EnabledFeatures": ["FEATURE_A", "FEATURE_B"],
"Message": "License is valid"
}
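From the module's side, a call against the validate endpoint could look like the following sketch (plain HttpClient; LicenseValidationResponse is an illustrative type mirroring the example response above):
using System;
using System.Net.Http;
using System.Net.Http.Json;

var http = new HttpClient
{
    BaseAddress = new Uri("https://api.yourcompany.com/licensing/")
};

// POST the validation request shown above
var response = await http.PostAsJsonAsync("v1/licenses/validate", new
{
    ProductId = "YOUR_PRODUCT",
    LicenseKey = "CUST-LICENSE-KEY",
    ModuleId = "module-instance-id",
    Version = "1.0.0"
});
response.EnsureSuccessStatusCode();

var result = await response.Content
    .ReadFromJsonAsync<LicenseValidationResponse>();
Console.WriteLine($"Valid: {result!.IsValid}, Tier: {result.Tier}");

public record LicenseValidationResponse(
    bool IsValid, string Type, string Tier,
    DateTime ExpirationDate, string[] EnabledFeatures, string Message);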
Best Practices
Module Development
DO:
- Always provide basic functionality without a license
- Fail gracefully when license limits are reached
- Cache license validation results (5-minute TTL)
- Log license status and feature availability
- Provide clear upgrade messages
- Support both online and offline licensing
DON'T:
- Hard-code license checks that prevent basic operation
- Make frequent license validation calls (use caching)
- Expose license keys in logs or error messages
- Implement license validation in client-side code only
- Block the module entirely if licensing service is down
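The caching and fail-open advice above can be combined in a few lines. A sketch, assuming an injected validation callback and an illustrative LicenseResult type (neither is part of the SDK):
using System;
using System.Threading.Tasks;

public class CachedLicenseValidator
{
    private static readonly TimeSpan CacheTtl = TimeSpan.FromMinutes(5);
    private readonly Func<string, Task<LicenseResult>> _validate;
    private LicenseResult? _cached;
    private DateTime _cachedAt;

    public CachedLicenseValidator(Func<string, Task<LicenseResult>> validate)
        => _validate = validate;

    public async Task<LicenseResult> GetLicenseAsync(string licenseKey)
    {
        // Serve the cached result while it is fresh (5-minute TTL)
        if (_cached != null && DateTime.UtcNow - _cachedAt < CacheTtl)
            return _cached;
        try
        {
            _cached = await _validate(licenseKey);
            _cachedAt = DateTime.UtcNow;
        }
        catch (Exception) when (_cached != null)
        {
            // Licensing service unreachable: keep serving the last known
            // result rather than blocking the module entirely (fail open).
        }
        return _cached!;
    }
}

public record LicenseResult(bool IsValid, string Tier);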
License Design Patterns
// Pattern 1: Freemium Model
protected override List<string> GetCommunityFeatures()
{
return new List<string>
{
"BASIC_PROCESSING",
"IMPORT_EXPORT",
"STANDARD_REPORTS"
};
}
// Pattern 2: Feature Bundles
public static class FeatureBundles
{
public static readonly Dictionary<LicenseTier, List<string>> TierFeatures = new()
{
[LicenseTier.Standard] = new() { "BUNDLE_STANDARD", "SUPPORT_EMAIL" },
[LicenseTier.Professional] = new() { "BUNDLE_PRO", "SUPPORT_PRIORITY", "API_ACCESS" },
[LicenseTier.Enterprise] = new() { "BUNDLE_ENTERPRISE", "SUPPORT_24X7", "WHITE_LABEL" }
};
}
// Pattern 3: Gradual Degradation
public async Task<Data> ProcessDataWithDegradation(Data input)
{
try
{
// Try premium processing
if (await IsFeatureEnabledAsync("PREMIUM_PROCESSING"))
{
return await PremiumProcessing(input);
}
}
catch (Exception ex)
{
Logger.LogWarning(ex, "Premium processing failed, falling back");
}
// Fall back to standard processing
try
{
if (await IsFeatureEnabledAsync("STANDARD_PROCESSING"))
{
return await StandardProcessing(input);
}
}
catch (Exception ex)
{
Logger.LogWarning(ex, "Standard processing failed, falling back");
}
// Always provide basic processing
return await BasicProcessing(input);
}
Revenue Models
Common Pricing Strategies
Model | Example Tiers | Best For |
---|---|---|
Subscription | $99/mo, $299/mo, $999/mo | SaaS products, ongoing value |
Perpetual + Maintenance | $5000 + 20%/year | Enterprise software |
Usage-Based | $0.001/API call, $0.10/GB | Infrastructure, APIs |
Hybrid | $199/mo + $0.01/transaction | Platforms with variable usage |
Freemium | Free → $49 → $199 → Custom | Developer tools, broad market |
Implementation Example
// Define your pricing tiers
public class PricingTiers
{
public static readonly Dictionary<LicenseTier, TierDefinition> Tiers = new()
{
[LicenseTier.Community] = new TierDefinition
{
Name = "Community",
Price = 0,
Features = new[] { "BASIC_FEATURES" },
Limits = new() { ["api_calls"] = 1000, ["users"] = 3 }
},
[LicenseTier.Standard] = new TierDefinition
{
Name = "Standard",
Price = 299,
Features = new[] { "BASIC_FEATURES", "ADVANCED_ANALYTICS", "EMAIL_SUPPORT" },
Limits = new() { ["api_calls"] = 50000, ["users"] = 10 }
},
[LicenseTier.Professional] = new TierDefinition
{
Name = "Professional",
Price = 999,
Features = new[] { "ALL_STANDARD", "ML_PREDICTIONS", "API_ACCESS", "PRIORITY_SUPPORT" },
Limits = new() { ["api_calls"] = 500000, ["users"] = 50 }
},
[LicenseTier.Enterprise] = new TierDefinition
{
Name = "Enterprise",
Price = -1, // Custom pricing
Features = new[] { "ALL_FEATURES", "WHITE_LABEL", "CUSTOM_INTEGRATION", "SLA" },
Limits = new() { ["api_calls"] = -1, ["users"] = -1 } // Unlimited
}
};
}
Next Steps
Ready to add licensing to your module?
- Inherit from LicensedModuleBase
- Define your product ID and features
- Implement feature gating in your code
- Set up license validation (local or cloud)
- Define your pricing tiers
- Test with different license scenarios
See modules/examples/IndustrialAnalytics/ for a complete example.
Remote Modules
Develop and integrate modules that run on remote systems, edge devices, or cloud platforms while maintaining seamless communication with the NEXUS-1 message bus.
Remote Module Architecture
Overview
Remote modules extend NEXUS-1 capabilities beyond the local system, enabling distributed architectures for scalability, geographic distribution, and integration with external systems.
Remote vs Local Modules
Aspect | Local Modules | Remote Modules |
---|---|---|
Deployment | Same host as NEXUS-1 | Separate host/network/cloud |
Communication | In-process or IPC | Network protocols (gRPC, REST, WebSocket) |
Latency | < 1ms typical | 1-100ms+ depending on network |
Reliability | Process-level | Network reliability required |
Security | Process isolation | TLS, authentication, firewall |
Scalability | Limited by host | Horizontal scaling possible |
How Remote Modules Work
Launch Process Overview
Important: No Local NEXUS Host Required
Remote modules do NOT require a local NEXUS-1 installation. They are independent services that connect to the central NEXUS-1 kernel via network protocols (gRPC, REST, WebSocket). This allows deployment on any system with network connectivity to the NEXUS-1 host.
Two Deployment Models
1. Pull Model (Self-Connecting)
Remote modules launch independently and connect to NEXUS-1:
- Module starts as a standalone service/application
- Connects to NEXUS-1's gRPC/REST endpoint
- Registers itself and its capabilities
- Maintains its own lifecycle
Use when: Module runs on edge devices, cloud services, or partner systems
2. Push Model (NEXUS-Managed)
NEXUS-1 launches remote modules via deployment agents:
- NEXUS-1 triggers module start via SSH, Docker API, or K8s
- Module receives connection details on startup
- NEXUS-1 manages the module lifecycle
- Automatic restart on failure
Use when: You want centralized control over module lifecycle
Module Discovery and Registration
Remote modules are discovered through:
- Manifest Configuration: Define remote endpoints in nexus-manifest.yaml
- Dynamic Registration: Modules connect and register themselves
- Service Discovery: Integration with Consul, etcd, or Kubernetes
Remote Module Configuration
Manifest Configuration
Configure remote modules in your nexus-manifest.yaml:
# nexus-manifest.yaml
modules:
# Pull Model - Module connects to NEXUS
- name: RemoteDataProcessor
type: remote
enabled: true
connection:
type: grpc
endpoint: grpc://data-processor.internal:50051
tls:
enabled: true
ca_cert: /certs/ca.crt
verify_hostname: true
health_check:
interval: 30s
timeout: 5s
capabilities:
- data.process
- data.transform
# Push Model - NEXUS launches module
- name: EdgeSensorGateway
type: remote
enabled: true
deployment:
type: ssh
host: edge-device-01.local
user: nexus
key_file: /keys/edge-device.key
command: /opt/nexus-modules/sensor-gateway
args:
--nexus-url: "grpcs://nexus.example.com:8443"
--module-id: "{module.id}"
--auth-token: "{module.token}"
restart_policy:
type: always
max_attempts: 5
delay: 10s
# Docker-based remote module
- name: AnalyticsEngine
type: remote
enabled: true
deployment:
type: docker
image: mycompany/analytics-module:latest
host: tcp://docker-host.internal:2376
env:
NEXUS_URL: "https://nexus.example.com:8443"
MODULE_ID: "{module.id}"
AUTH_TOKEN: "{module.token}"
volumes:
- /data/analytics:/data
restart: unless-stopped
# Kubernetes-based remote module
- name: MLProcessor
type: remote
enabled: true
deployment:
type: kubernetes
namespace: nexus-modules
manifest: |
apiVersion: apps/v1
kind: Deployment
metadata:
name: ml-processor
spec:
replicas: 3
selector:
matchLabels:
app: ml-processor
template:
metadata:
labels:
app: ml-processor
spec:
containers:
- name: processor
image: mycompany/ml-processor:latest
env:
- name: NEXUS_URL
value: "grpcs://nexus.example.com:8443"
- name: MODULE_ID
valueFrom:
fieldRef:
fieldPath: metadata.name
service:
type: LoadBalancer
port: 50051
C# DLL as Remote Module
Creating a Remote Module from a C# Class Library
Transform any C# DLL into a remote NEXUS module with minimal code:
// DataProcessor.cs - Your existing business logic DLL
namespace MyCompany.DataProcessing
{
public class DataProcessor
{
private readonly ILogger _logger;
private readonly DatabaseContext _db;
public DataProcessor(ILogger logger, DatabaseContext db)
{
_logger = logger;
_db = db;
}
public async Task<ProcessingResult> ProcessDataAsync(DataPacket packet)
{
// Your existing data processing logic
var processed = TransformData(packet);
await _db.ProcessedData.AddAsync(processed);
await _db.SaveChangesAsync();
_logger.LogInformation($"Processed packet {packet.Id}");
return new ProcessingResult { Success = true, RecordId = processed.Id };
}
private ProcessedData TransformData(DataPacket packet)
{
// Business logic here
return new ProcessedData
{
Timestamp = DateTime.UtcNow,
Value = packet.Value * 1.5,
Source = packet.Source
};
}
}
}
Remote Module Host Wrapper
Create a minimal host application to expose your DLL as a NEXUS module:
// Program.cs - Remote module host
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Nexus.SDK.Remote;
using MyCompany.DataProcessing;
var builder = Host.CreateDefaultBuilder(args);
builder.ConfigureServices((context, services) =>
{
// Register your existing services
services.AddDbContext<DatabaseContext>(options =>
options.UseSqlServer(context.Configuration.GetConnectionString("Default")));
services.AddScoped<DataProcessor>();
// Add NEXUS remote module support
services.AddNexusRemoteModule(options =>
{
options.ModuleId = "data-processor-01";
options.ModuleName = "Remote Data Processor";
options.ServerUrl = context.Configuration["Nexus:ServerUrl"]
?? "https://nexus.example.com:8443";
options.AuthToken = context.Configuration["Nexus:AuthToken"];
options.ReconnectInterval = TimeSpan.FromSeconds(30);
});
// Register the module adapter
services.AddHostedService<DataProcessorModule>();
});
var host = builder.Build();
await host.RunAsync();
// DataProcessorModule.cs - Adapter between your DLL and NEXUS
public class DataProcessorModule : NexusRemoteModuleBase
{
private readonly DataProcessor _processor;
private readonly ILogger _logger;
public DataProcessorModule(
DataProcessor processor,
ILogger logger,
INexusRemoteClient client) : base(client)
{
_processor = processor;
_logger = logger;
}
protected override async Task OnInitializeAsync()
{
// Register capabilities
await RegisterCapabilityAsync("data.process", "Process data packets");
// Subscribe to data processing requests
await SubscribeAsync("data.process.request", HandleProcessRequest);
// Subscribe to batch processing
await SubscribeAsync("data.batch.process", HandleBatchProcess);
_logger.LogInformation("Data processor module initialized");
}
private async Task HandleProcessRequest(RemoteMessage message)
{
try
{
var packet = message.GetPayload<DataPacket>();
var result = await _processor.ProcessDataAsync(packet);
// Respond with result
await message.RespondAsync(result);
// Publish completion event
await PublishAsync("data.processed", new
{
PacketId = packet.Id,
RecordId = result.RecordId,
Timestamp = DateTime.UtcNow
});
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to process data packet");
await message.RespondErrorAsync(ex.Message);
}
}
private async Task HandleBatchProcess(RemoteMessage message)
{
var batch = message.GetPayload<List<DataPacket>>();
var results = new List<ProcessingResult>();
foreach (var packet in batch)
{
try
{
var result = await _processor.ProcessDataAsync(packet);
results.Add(result);
}
catch (Exception ex)
{
_logger.LogError(ex, $"Failed to process packet {packet.Id}");
results.Add(new ProcessingResult
{
Success = false,
Error = ex.Message
});
}
}
await message.RespondAsync(results);
}
protected override async Task<HealthCheckResult> OnHealthCheckAsync()
{
// Custom health check
try
{
await _processor.CheckDatabaseConnectionAsync();
return HealthCheckResult.Healthy("Database connection OK");
}
catch (Exception ex)
{
return HealthCheckResult.Unhealthy($"Database error: {ex.Message}");
}
}
}
Deployment Options
1. Windows Service
# Install as Windows Service
sc create NexusDataProcessor binPath= "C:\NexusModules\DataProcessor.exe"
sc config NexusDataProcessor start= auto
sc start NexusDataProcessor
2. Docker Container
# Dockerfile
FROM mcr.microsoft.com/dotnet/runtime:7.0
WORKDIR /app
COPY publish/ .
ENV ASPNETCORE_URLS=http://+:50051
ENV Nexus__ServerUrl=https://nexus.example.com:8443
ENTRYPOINT ["dotnet", "DataProcessor.dll"]
# Build and run
docker build -t data-processor .
docker run -d --name nexus-data-processor \
-e Nexus__AuthToken=$NEXUS_TOKEN \
-p 50051:50051 \
data-processor
3. .NET CLI Tool
# Package as .NET tool
dotnet pack -c Release
dotnet tool install --global MyCompany.DataProcessor
# Run anywhere
data-processor --nexus-url https://nexus.example.com:8443
Development Approaches
Standalone Services
Develop remote modules as independent services that connect to NEXUS-1.
// Standalone C# service connecting to NEXUS-1
using Nexus.SDK.Remote;
public class RemoteSensorGateway
{
private readonly NexusRemoteClient _client;
private readonly List<Sensor> _sensors;
public static async Task Main(string[] args)
{
var gateway = new RemoteSensorGateway();
await gateway.RunAsync();
}
public RemoteSensorGateway()
{
// Configure remote connection
var config = new NexusRemoteConfig
{
ServerUrl = "https://nexus.example.com:8443",
ClientCertificate = LoadCertificate("client.pfx"),
ModuleId = "remote-sensor-gateway",
ModuleName = "Remote Sensor Gateway",
ReconnectInterval = TimeSpan.FromSeconds(5)
};
_client = new NexusRemoteClient(config);
_sensors = DiscoverSensors();
}
public async Task RunAsync()
{
// Connect to NEXUS-1
await _client.ConnectAsync();
// Register module capabilities
await _client.RegisterCapabilitiesAsync(new[]
{
"sensor.temperature.read",
"sensor.pressure.read",
"sensor.humidity.read"
});
// Subscribe to commands
await _client.SubscribeAsync("command.sensor.*", HandleCommand);
// Start sensor monitoring
var cts = new CancellationTokenSource();
var monitorTask = MonitorSensorsAsync(cts.Token);
// Handle shutdown
Console.CancelKeyPress += (s, e) =>
{
e.Cancel = true;
cts.Cancel();
};
await monitorTask;
await _client.DisconnectAsync();
}
private async Task MonitorSensorsAsync(CancellationToken ct)
{
while (!ct.IsCancellationRequested)
{
foreach (var sensor in _sensors)
{
try
{
var reading = await sensor.ReadAsync();
// Publish to NEXUS-1
await _client.PublishAsync($"sensor.{sensor.Type}.reading", new
{
sensor_id = sensor.Id,
value = reading.Value,
unit = reading.Unit,
timestamp = DateTime.UtcNow,
location = sensor.Location
});
}
catch (Exception ex)
{
await _client.LogErrorAsync($"Sensor {sensor.Id} read failed", ex);
}
}
await Task.Delay(1000, ct);
}
}
private async Task HandleCommand(RemoteMessage message)
{
var command = message.GetPayload<SensorCommand>();
switch (command.Action)
{
case "calibrate":
await CalibrateSensor(command.SensorId);
await message.RespondAsync(new { success = true });
break;
case "reset":
await ResetSensor(command.SensorId);
await message.RespondAsync(new { success = true });
break;
default:
await message.RespondAsync(new
{
success = false,
error = $"Unknown command: {command.Action}"
});
break;
}
}
}
# Standalone Python service connecting to NEXUS-1
import asyncio
import ssl
from datetime import datetime
from nexus_sdk_remote import NexusRemoteClient, RemoteConfig
class RemoteSensorGateway:
def __init__(self):
# Configure remote connection
self.config = RemoteConfig(
server_url="https://nexus.example.com:8443",
client_cert="client.pem",
client_key="client-key.pem",
ca_cert="ca.pem",
module_id="remote-sensor-gateway",
module_name="Remote Sensor Gateway",
reconnect_interval=5.0
)
self.client = NexusRemoteClient(self.config)
self.sensors = self.discover_sensors()
self.running = True
async def run(self):
"""Main entry point for the service"""
# Connect to NEXUS-1
await self.client.connect()
# Register module capabilities
await self.client.register_capabilities([
"sensor.temperature.read",
"sensor.pressure.read",
"sensor.humidity.read"
])
# Subscribe to commands
await self.client.subscribe("command.sensor.*", self.handle_command)
# Start monitoring tasks
tasks = [
asyncio.create_task(self.monitor_sensors()),
asyncio.create_task(self.maintain_connection())
]
try:
await asyncio.gather(*tasks)
except KeyboardInterrupt:
self.running = False
await self.client.disconnect()
async def monitor_sensors(self):
"""Monitor and publish sensor data"""
while self.running:
for sensor in self.sensors:
try:
reading = await sensor.read_async()
# Publish to NEXUS-1
await self.client.publish(
f"sensor.{sensor.type}.reading",
{
"sensor_id": sensor.id,
"value": reading.value,
"unit": reading.unit,
"timestamp": datetime.utcnow().isoformat(),
"location": sensor.location
}
)
except Exception as e:
await self.client.log_error(
f"Sensor {sensor.id} read failed: {e}"
)
await asyncio.sleep(1.0)
async def handle_command(self, message):
"""Handle commands from NEXUS-1"""
command = message.payload
try:
if command["action"] == "calibrate":
await self.calibrate_sensor(command["sensor_id"])
await message.respond({"success": True})
elif command["action"] == "reset":
await self.reset_sensor(command["sensor_id"])
await message.respond({"success": True})
else:
await message.respond({
"success": False,
"error": f"Unknown command: {command['action']}"
})
except Exception as e:
await message.respond({
"success": False,
"error": str(e)
})
async def maintain_connection(self):
"""Maintain connection health"""
while self.running:
if not self.client.is_connected:
try:
await self.client.reconnect()
except Exception as e:
print(f"Reconnection failed: {e}")
# Send heartbeat
await self.client.publish("heartbeat", {
"module_id": self.config.module_id,
"timestamp": datetime.utcnow().isoformat(),
"status": "healthy"
})
await asyncio.sleep(30)
if __name__ == "__main__":
gateway = RemoteSensorGateway()
asyncio.run(gateway.run())
// Standalone C++ service connecting to NEXUS-1
#include <nexus/remote_client.hpp> // SDK header name assumed
#include <atomic>
#include <chrono>
#include <csignal>
#include <memory>
#include <thread>
#include <vector>
class RemoteSensorGateway {
private:
nexus::RemoteClient client_;
std::vector<std::unique_ptr<Sensor>> sensors_;
std::atomic<bool> running_{true};
public:
RemoteSensorGateway() {
// Configure remote connection
nexus::RemoteConfig config{
.server_url = "https://nexus.example.com:8443",
.client_cert_path = "client.pem",
.client_key_path = "client-key.pem",
.ca_cert_path = "ca.pem",
.module_id = "remote-sensor-gateway",
.module_name = "Remote Sensor Gateway",
.reconnect_interval = std::chrono::seconds(5)
};
client_ = nexus::RemoteClient(config);
sensors_ = discover_sensors();
}
void run() {
// Connect to NEXUS-1
client_.connect();
// Register capabilities
client_.register_capabilities({
"sensor.temperature.read",
"sensor.pressure.read",
"sensor.humidity.read"
});
// Subscribe to commands
client_.subscribe("command.sensor.*",
[this](const nexus::RemoteMessage& msg) {
handle_command(msg);
});
// Start worker threads
std::thread monitor_thread(&RemoteSensorGateway::monitor_sensors, this);
std::thread health_thread(&RemoteSensorGateway::maintain_health, this);
// Wait for shutdown signal (signal handlers cannot capture state,
// so route the request through a static flag)
static std::atomic<bool> stop_requested{false};
std::signal(SIGINT, [](int) { stop_requested = true; });
while (running_ && !stop_requested) {
std::this_thread::sleep_for(std::chrono::seconds(1));
}
running_ = false;
// Cleanup
monitor_thread.join();
health_thread.join();
client_.disconnect();
}
private:
void monitor_sensors() {
while (running_) {
for (const auto& sensor : sensors_) {
try {
auto reading = sensor->read();
// Publish to NEXUS-1
client_.publish(
fmt::format("sensor.{}.reading", sensor->type()),
json{
{"sensor_id", sensor->id()},
{"value", reading.value},
{"unit", reading.unit},
{"timestamp", std::chrono::system_clock::now()},
{"location", sensor->location()}
}
);
} catch (const std::exception& e) {
client_.log_error(
fmt::format("Sensor {} read failed: {}",
sensor->id(), e.what())
);
}
}
std::this_thread::sleep_for(std::chrono::seconds(1));
}
}
void handle_command(const nexus::RemoteMessage& message) {
auto command = message.get_payload();
try {
if (command.action == "calibrate") {
calibrate_sensor(command.sensor_id);
message.respond(json{{"success", true}});
} else if (command.action == "reset") {
reset_sensor(command.sensor_id);
message.respond(json{{"success", true}});
} else {
message.respond(json{
{"success", false},
{"error", "Unknown command: " + command.action}
});
}
} catch (const std::exception& e) {
message.respond(json{
{"success", false},
{"error", e.what()}
});
}
}
void maintain_health() {
while (running_) {
if (!client_.is_connected()) {
try {
client_.reconnect();
} catch (const std::exception& e) {
std::cerr << "Reconnection failed: " << e.what() << std::endl;
}
}
// Send heartbeat
client_.publish("heartbeat", json{
{"module_id", client_.module_id()},
{"timestamp", std::chrono::system_clock::now()},
{"status", "healthy"}
});
std::this_thread::sleep_for(std::chrono::seconds(30));
}
}
};
// Standalone Node.js service connecting to NEXUS-1
const { NexusRemoteClient } = require('nexus-sdk-remote');
const fs = require('fs');
class RemoteSensorGateway {
constructor() {
// Configure remote connection
this.config = {
serverUrl: 'https://nexus.example.com:8443',
clientCert: fs.readFileSync('client.pem'),
clientKey: fs.readFileSync('client-key.pem'),
caCert: fs.readFileSync('ca.pem'),
moduleId: 'remote-sensor-gateway',
moduleName: 'Remote Sensor Gateway',
reconnectInterval: 5000
};
this.client = new NexusRemoteClient(this.config);
this.sensors = this.discoverSensors();
this.running = true;
}
async run() {
// Connect to NEXUS-1
await this.client.connect();
// Register capabilities
await this.client.registerCapabilities([
'sensor.temperature.read',
'sensor.pressure.read',
'sensor.humidity.read'
]);
// Subscribe to commands
await this.client.subscribe('command.sensor.*',
this.handleCommand.bind(this));
// Start monitoring
this.monitorSensors();
this.maintainConnection();
// Handle shutdown
process.on('SIGINT', async () => {
this.running = false;
await this.client.disconnect();
process.exit(0);
});
}
async monitorSensors() {
while (this.running) {
for (const sensor of this.sensors) {
try {
const reading = await sensor.read();
// Publish to NEXUS-1
await this.client.publish(
`sensor.${sensor.type}.reading`,
{
sensor_id: sensor.id,
value: reading.value,
unit: reading.unit,
timestamp: new Date().toISOString(),
location: sensor.location
}
);
} catch (error) {
await this.client.logError(
`Sensor ${sensor.id} read failed: ${error.message}`
);
}
}
await this.sleep(1000);
}
}
async handleCommand(message) {
const command = message.payload;
try {
switch (command.action) {
case 'calibrate':
await this.calibrateSensor(command.sensor_id);
await message.respond({ success: true });
break;
case 'reset':
await this.resetSensor(command.sensor_id);
await message.respond({ success: true });
break;
default:
await message.respond({
success: false,
error: `Unknown command: ${command.action}`
});
}
} catch (error) {
await message.respond({
success: false,
error: error.message
});
}
}
async maintainConnection() {
while (this.running) {
if (!this.client.isConnected) {
try {
await this.client.reconnect();
} catch (error) {
console.error('Reconnection failed:', error);
}
}
// Send heartbeat
await this.client.publish('heartbeat', {
module_id: this.config.moduleId,
timestamp: new Date().toISOString(),
status: 'healthy'
});
await this.sleep(30000);
}
}
sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
}
// Start the gateway
const gateway = new RemoteSensorGateway();
gateway.run().catch(console.error);
Containerized Modules
Deploy remote modules as Docker containers for consistent deployment and scaling.
# Dockerfile for remote module
FROM mcr.microsoft.com/dotnet/runtime:7.0 AS base
"""Initialize authorization module"""
await super().initialize(context)
# Subscribe to authorization checks
await self.message_bus.subscribe("authz/check/*", self.handle_authorization_check)
self.logger.info("Authorization module initialized")
async def handle_authorization_check(self, message: Message):
"""Handle authorization check requests"""
request = message.get_payload()
is_authorized = await self.check_authorization(
request["username"],
request["resource"],
request["action"]
)
await self.message_bus.publish(
f"authz/result/{message.correlation_id}",
{
"authorized": is_authorized,
"username": request["username"],
"resource": request["resource"],
"action": request["action"],
"timestamp": datetime.utcnow().isoformat()
}
)
async def check_authorization(self, username: str, resource: str, action: str) -> bool:
"""Check if user is authorized for action on resource"""
# Get user roles
user_roles = self.user_roles.get(username, set())
if not user_roles:
self.logger.warning(f"User {username} has no roles assigned")
return False
# Check permissions for each role
for role in user_roles:
permissions = self.role_permissions.get(role, set())
if action in permissions:
self.logger.info(f"User {username} authorized for {action} on {resource}")
return True
self.logger.warning(f"User {username} denied {action} on {resource}")
return False
@staticmethod
def require_permission(permission: str):
    """Decorator for permission-based authorization"""
    def decorator(func):
        @wraps(func)
        async def wrapper(self, *args, **kwargs):
            # Get current user from the decorated module's context
            user = self.context.current_user
            # Delegate the check to the module's authorization service
            if not await self.authz.check_authorization(user, "module", permission):
                raise PermissionError(f"User {user} lacks permission: {permission}")
            return await func(self, *args, **kwargs)
        return wrapper
    return decorator
# Example usage in another module
class SecureOperationsModule(ModuleBase):
def __init__(self):
super().__init__()
self.authz = AuthorizationModule()
@AuthorizationModule.require_permission("write")
async def update_configuration(self, key: str, value: any) -> bool:
"""Update configuration with authorization check"""
# This will only execute if user has write permission
await self.config.set(key, value)
self.logger.info(f"Configuration updated: {key}")
return True
@AuthorizationModule.require_permission("delete")
async def delete_resource(self, resource_id: str) -> bool:
"""Delete resource with authorization check"""
# This will only execute if user has delete permission
await self.message_bus.publish("resource/delete", {"id": resource_id})
return True
#include <nexus/module.hpp>
#include <unordered_map>
#include <unordered_set>
#include <functional>
class AuthorizationModule : public nexus::ModuleBase {
private:
using PermissionSet = std::unordered_set<std::string>;
using RoleMap = std::unordered_map<std::string, PermissionSet>;
using UserRoleMap = std::unordered_map<std::string, std::unordered_set<std::string>>;
RoleMap role_permissions_;
UserRoleMap user_roles_;
public:
nexus::Status Initialize(const nexus::ModuleContext& context) override {
nexus::Status status = ModuleBase::Initialize(context);
if (!status.ok()) return status;
// Initialize role permissions
role_permissions_ = {
{"admin", {"read", "write", "delete", "config"}},
{"operator", {"read", "write"}},
{"viewer", {"read"}}
};
// Initialize user roles (in production, load from secure store)
user_roles_ = {
{"alice", {"admin"}},
{"bob", {"operator"}},
{"charlie", {"viewer"}}
};
// Subscribe to authorization checks
message_bus_->Subscribe("authz/check/*",
[this](const nexus::Message& msg) {
return HandleAuthorizationCheck(msg);
});
logger_->Info("Authorization module initialized");
return nexus::Status::OK;
}
nexus::Status HandleAuthorizationCheck(const nexus::Message& message) {
try {
auto request = message.GetPayload<AuthorizationRequest>();
bool is_authorized = CheckAuthorization(
request.username,
request.resource,
request.action
);
AuthorizationResult result{
is_authorized,
request.username,
request.resource,
request.action,
std::chrono::system_clock::now()
};
message_bus_->Publish(
"authz/result/" + message.correlation_id,
result
);
} catch (const std::exception& e) {
logger_->Error("Authorization check failed: {}", e.what());
}
return nexus::Status::OK;
}
bool CheckAuthorization(const std::string& username,
const std::string& resource,
const std::string& action) {
// Get user roles
auto user_it = user_roles_.find(username);
if (user_it == user_roles_.end()) {
logger_->Warning("User {} has no roles assigned", username);
return false;
}
// Check permissions for each role
for (const auto& role : user_it->second) {
auto role_it = role_permissions_.find(role);
if (role_it != role_permissions_.end()) {
if (role_it->second.count(action) > 0) {
logger_->Info("User {} authorized for {} on {}",
username, action, resource);
return true;
}
}
}
logger_->Warning("User {} denied {} on {}",
username, action, resource);
return false;
}
// Permission guard for method protection
template <typename Func>
class PermissionGuard {
private:
std::string permission_;
Func func_;
AuthorizationModule* authz_;
public:
PermissionGuard(const std::string& permission, Func func,
AuthorizationModule* authz)
: permission_(permission), func_(func), authz_(authz) {}
template <typename... Args>
auto operator()(const std::string& user, Args&&... args)
-> decltype(func_(std::forward<Args>(args)...)) {
if (!authz_->CheckAuthorization(user, "module", permission_)) {
throw std::runtime_error("Access denied: " + permission_);
}
return func_(std::forward<Args>(args)...);
}
};
// Helper to create permission guards
template <typename Func>
PermissionGuard<Func> RequirePermission(const std::string& permission,
Func func) {
return PermissionGuard<Func>(permission, func, this);
}
};
// Example usage
class SecureOperationsModule : public nexus::ModuleBase {
private:
std::shared_ptr<AuthorizationModule> authz_;
public:
nexus::Status UpdateConfiguration(const std::string& key,
const nexus::Value& value) {
auto guarded_update = authz_->RequirePermission("write",
[this](const std::string& k, const nexus::Value& v) {
config_->Set(k, v);
logger_->Info("Configuration updated: {}", k);
return nexus::Status::OK;
});
return guarded_update(context_->current_user(), key, value);
}
};
classdef AuthorizationModule < nexus.ModuleBase
properties (Access = private)
rolePermissions
userRoles
end
methods
function obj = AuthorizationModule()
obj@nexus.ModuleBase();
% Initialize role permissions
obj.rolePermissions = containers.Map();
obj.rolePermissions('admin') = {'read', 'write', 'delete', 'config'};
obj.rolePermissions('operator') = {'read', 'write'};
obj.rolePermissions('viewer') = {'read'};
% Initialize user roles (in production, from secure store)
obj.userRoles = containers.Map();
obj.userRoles('alice') = {'admin'};
obj.userRoles('bob') = {'operator'};
obj.userRoles('charlie') = {'viewer'};
end
function initialize(obj, context)
% Initialize authorization module
initialize@nexus.ModuleBase(obj, context);
% Subscribe to authorization checks
obj.messageBus.subscribe('authz/check/*', @obj.handleAuthorizationCheck);
obj.logger.info('Authorization module initialized');
end
function handleAuthorizationCheck(obj, message)
% Handle authorization check requests
try
request = message.getPayload();
isAuthorized = obj.checkAuthorization(...
request.username, ...
request.resource, ...
request.action ...
);
result = struct(...
'authorized', isAuthorized, ...
'username', request.username, ...
'resource', request.resource, ...
'action', request.action, ...
'timestamp', datetime('now', 'TimeZone', 'UTC') ...
);
obj.messageBus.publish(...
sprintf('authz/result/%s', message.correlationId), ...
result ...
);
catch ME
obj.logger.error(['Authorization check failed: ' ME.message]);
end
end
function isAuthorized = checkAuthorization(obj, username, resource, action)
% Check if user is authorized for action on resource
isAuthorized = false;
% Get user roles
if ~obj.userRoles.isKey(username)
obj.logger.warning(sprintf('User %s has no roles assigned', username));
return;
end
userRoleList = obj.userRoles(username);
% Check permissions for each role
for i = 1:length(userRoleList)
role = userRoleList{i};
if obj.rolePermissions.isKey(role)
permissions = obj.rolePermissions(role);
if any(strcmp(permissions, action))
obj.logger.info(sprintf(...
'User %s authorized for %s on %s', ...
username, action, resource));
isAuthorized = true;
return;
end
end
end
obj.logger.warning(sprintf(...
'User %s denied %s on %s', ...
username, action, resource));
end
function enforcePermission(obj, permission, username)
% Enforce permission check - throws error if not authorized
if ~obj.checkAuthorization(username, 'module', permission)
error('AuthorizationModule:AccessDenied', ...
'User %s lacks permission: %s', username, permission);
end
end
end
end
% Example usage in another module
classdef SecureOperationsModule < nexus.ModuleBase
properties (Access = private)
authz
end
methods
function obj = SecureOperationsModule()
obj@nexus.ModuleBase();
obj.authz = AuthorizationModule();
end
function success = updateConfiguration(obj, key, value)
% Update configuration with authorization check
currentUser = obj.context.currentUser;
% Check permission
obj.authz.enforcePermission('write', currentUser);
% Perform update
obj.config.set(key, value);
obj.logger.info(sprintf('Configuration updated: %s', key));
success = true;
end
end
end
// LabVIEW Authorization Module Implementation
// File: AuthorizationModule.lvclass
// Class Private Data
// - RolePermissions: Map<String, Array<String>>
// - UserRoles: Map<String, Array<String>>
// Initialize Method
Begin Initialize
// Initialize role permissions
RolePermissions["admin"] = ["read", "write", "delete", "config"]
RolePermissions["operator"] = ["read", "write"]
RolePermissions["viewer"] = ["read"]
// Initialize user roles
UserRoles["alice"] = ["admin"]
UserRoles["bob"] = ["operator"]
UserRoles["charlie"] = ["viewer"]
// Subscribe to authorization checks
MessageBus.Subscribe("authz/check/*", HandleAuthorizationCheck.vi)
Logger.Info("Authorization module initialized")
End Initialize
// Handle Authorization Check Method
Begin HandleAuthorizationCheck
Try
// Get request data
Username = Message.Payload.Username
Resource = Message.Payload.Resource
Action = Message.Payload.Action
// Check authorization
IsAuthorized = CheckAuthorization(Username, Resource, Action)
// Create result
Result.Authorized = IsAuthorized
Result.Username = Username
Result.Resource = Resource
Result.Action = Action
Result.Timestamp = CurrentDateTime()
// Publish result
Topic = Format("authz/result/%s", Message.CorrelationId)
MessageBus.Publish(Topic, Result)
Catch Exception
Logger.Error("Authorization check failed: " + Exception.Message)
End Try
End HandleAuthorizationCheck
// Check Authorization Method
// Inputs: Username, Resource, Action (Strings)
// Outputs: IsAuthorized (Boolean)
Begin CheckAuthorization
IsAuthorized = False
// Get user roles
If Not UserRoles.ContainsKey(Username) Then
Logger.Warning(Format("User %s has no roles assigned", Username))
Return IsAuthorized
End If
UserRoleList = UserRoles[Username]
// Check each role
For Each Role In UserRoleList
If RolePermissions.ContainsKey(Role) Then
Permissions = RolePermissions[Role]
// Check if action is in permissions
For Each Permission In Permissions
If Permission == Action Then
Logger.Info(Format("User %s authorized for %s on %s",
Username, Action, Resource))
IsAuthorized = True
Return IsAuthorized
End If
End For
End If
End For
Logger.Warning(Format("User %s denied %s on %s",
Username, Action, Resource))
Return IsAuthorized
End CheckAuthorization
// Enforce Permission Method
// Inputs: Permission (String), Username (String)
// Outputs: None (Throws error if not authorized)
Begin EnforcePermission
IsAuthorized = CheckAuthorization(Username, "module", Permission)
If Not IsAuthorized Then
Error = Format("User %s lacks permission: %s", Username, Permission)
Throw AuthorizationException(Error)
End If
End EnforcePermission
// Example Secure Operations VI
// UpdateConfiguration.vi
Begin UpdateConfiguration
// Inputs: Key (String), Value (Variant)
// Outputs: Success (Boolean)
// Get current user
CurrentUser = Context.CurrentUser
// Check permission
AuthModule.EnforcePermission("write", CurrentUser)
// If we get here, user is authorized
Config.Set(Key, Value)
Logger.Info(Format("Configuration updated: %s", Key))
Success = True
End UpdateConfiguration
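For completeness, here is what the requesting side of the authz/check protocol might look like in C#. This is a sketch: the AuthzResult type, the correlationId argument to PublishAsync, and the subscribe callback signature are assumptions, not confirmed SDK API.
public class AuthzClient : ModuleBase
{
    // Ask the authorization module whether a user may perform an action
    public async Task<bool> IsAuthorizedAsync(string username, string action)
    {
        var correlationId = Guid.NewGuid().ToString();
        var tcs = new TaskCompletionSource<bool>();

        // Listen for the result before publishing the request
        await Messages.SubscribeAsync($"authz/result/{correlationId}", message =>
        {
            var result = message.GetPayload<AuthzResult>();
            tcs.TrySetResult(result.Authorized);
            return Task.CompletedTask;
        });

        // The authorization module replies on authz/result/{correlationId}
        await Messages.PublishAsync($"authz/check/{username}",
            new { username, resource = "module", action },
            correlationId: correlationId);

        // Fail closed if no answer arrives within the timeout
        var done = await Task.WhenAny(tcs.Task, Task.Delay(TimeSpan.FromSeconds(2)));
        return done == tcs.Task && await tcs.Task;
    }
}
Failing closed on timeout matters here: an unreachable authorization module should deny access, not grant it.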
Performance Profiling
Profile and optimize your NEXUS-1 modules to ensure they meet performance requirements and operate efficiently in production environments.
Profiling Tools and Setup
Language-Specific Profilers
Each development language offers specialized tools for performance profiling. Configure these tools to work with your NEXUS-1 modules.
// C# Performance Profiling Setup
using System.Diagnostics;
using Microsoft.Diagnostics.NETCore.Client; // DiagnosticsClient, EventPipeProvider
using Microsoft.Diagnostics.Tracing;
using Microsoft.Diagnostics.Tracing.Parsers;
[Module("profiled-module", "Performance Profiled Module", "1.0.0")]
public class ProfiledModule : ModuleBase
{
private readonly ActivitySource _activitySource;
private readonly DiagnosticSource _diagnosticSource;
public ProfiledModule()
{
// Initialize activity source for distributed tracing
_activitySource = new ActivitySource("Nexus.Modules.ProfiledModule", "1.0.0");
_diagnosticSource = new DiagnosticListener("ProfiledModule");
}
// Enable EventPipe profiling
protected override void OnInitialized()
{
// Start CPU profiling session
var providers = new List<EventPipeProvider>
{
new EventPipeProvider("Microsoft-Windows-DotNETRuntime",
EventLevel.Informational,
(long)ClrTraceEventParser.Keywords.GC |
(long)ClrTraceEventParser.Keywords.Exception |
(long)ClrTraceEventParser.Keywords.Jit),
new EventPipeProvider("System.Runtime",
EventLevel.Informational)
};
// Enable profiling in development builds only
#if DEBUG
var client = new DiagnosticsClient(Environment.ProcessId);
EventPipeSession session = client.StartEventPipeSession(providers);
#endif
// Custom performance counters
InitializePerformanceCounters();
}
// Method-level profiling with activities
public async Task ProcessDataAsync(SensorData data)
{
using var activity = _activitySource.StartActivity("ProcessData");
activity?.SetTag("sensor.id", data.SensorId);
activity?.SetTag("data.size", data.Size);
var stopwatch = Stopwatch.StartNew();
try
{
// Pre-processing
using (var preprocessActivity = _activitySource.StartActivity("PreProcess"))
{
await PreprocessDataAsync(data);
}
// Main processing
using (var processActivity = _activitySource.StartActivity("MainProcess"))
{
var result = await AnalyzeDataAsync(data);
processActivity?.SetTag("result.score", result.Score);
}
// Record metrics
RecordPerformanceMetrics("ProcessData", stopwatch.ElapsedMilliseconds);
}
catch (Exception ex)
{
activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
throw;
}
}
// Memory profiling helpers
private void ProfileMemoryUsage()
{
var gcInfo = GC.GetMemoryInfo();
var gen0 = GC.CollectionCount(0);
var gen1 = GC.CollectionCount(1);
var gen2 = GC.CollectionCount(2);
Logger.LogDebug("Memory Profile: Heap={HeapMB}MB, Gen0={Gen0}, Gen1={Gen1}, Gen2={Gen2}",
gcInfo.HeapSizeBytes / 1024 / 1024,
gen0, gen1, gen2);
// Detailed allocation profiling
if (_diagnosticSource.IsEnabled("MemoryProfile"))
{
_diagnosticSource.Write("MemoryProfile", new
{
HeapSize = gcInfo.HeapSizeBytes,
HighMemoryLoadThreshold = gcInfo.HighMemoryLoadThresholdBytes,
TotalAvailableMemory = gcInfo.TotalAvailableMemoryBytes,
FragmentedBytes = gcInfo.FragmentedBytes,
Gen0Collections = gen0,
Gen1Collections = gen1,
Gen2Collections = gen2
});
}
}
}
// Profiling session configuration
public class ProfilingConfiguration
{
public static void EnableProfiling(IHostBuilder hostBuilder)
{
hostBuilder.ConfigureServices(services =>
{
// Add performance monitoring
services.AddSingleton<PerformanceMonitor>(); // registers the monitoring service (type name illustrative)
// Configure OpenTelemetry
services.AddOpenTelemetryTracing(builder =>
{
builder
.SetResourceBuilder(ResourceBuilder.CreateDefault()
.AddService("nexus-module"))
.AddSource("Nexus.Modules.*")
.AddAspNetCoreInstrumentation()
.AddHttpClientInstrumentation()
.AddJaegerExporter(options =>
{
options.AgentHost = "localhost";
options.AgentPort = 6831;
});
});
});
}
}
// Command-line profiling with dotnet-trace
/*
# CPU profiling
dotnet-trace collect --process-id $(pidof dotnet) \
--profile cpu-sampling \
--duration 00:00:30 \
--output trace.nettrace
# Memory profiling
dotnet-trace collect --process-id $(pidof dotnet) \
--profile gc-verbose \
--duration 00:00:30 \
--output memory.nettrace
# Custom events
dotnet-trace collect --process-id $(pidof dotnet) \
--providers Microsoft-Windows-DotNETRuntime:0x1F000080018:5 \
--output custom.nettrace
# Convert to speedscope format
dotnet-trace convert trace.nettrace --format speedscope
*/
# Python Performance Profiling Setup
import cProfile
import pstats
import tracemalloc
import asyncio
import os
import time
from contextlib import contextmanager
from datetime import datetime
from functools import wraps
import pyinstrument
from memory_profiler import profile as memory_profile
from line_profiler import LineProfiler
@module("profiled-module", "Performance Profiled Module", "1.0.0")
class ProfiledModule(Module):
def __init__(self):
super().__init__()
# Initialize profilers
self.cpu_profiler = cProfile.Profile()
self.pyinstrument = pyinstrument.Profiler()
self.line_profiler = LineProfiler()
# Start memory tracking
tracemalloc.start()
# Performance metrics
self.metrics = {
'message_processing_times': [],
'memory_snapshots': []
}
# CPU profiling decorator (a plain function used as a decorator inside the class body)
def profile_cpu(func):
@wraps(func)
async def wrapper(self, *args, **kwargs):
if self.config.get('enable_profiling', False):
start = time.perf_counter()
self.cpu_profiler.enable()
try:
return await func(self, *args, **kwargs)
finally:
self.cpu_profiler.disable()
self.metrics['message_processing_times'].append(
time.perf_counter() - start)
# Save profile periodically
if len(self.metrics['message_processing_times']) % 1000 == 0:
self.save_cpu_profile()
else:
return await func(self, *args, **kwargs)
return wrapper
# Memory profiling with tracemalloc
@contextmanager
def profile_memory(self, label):
snapshot_before = tracemalloc.take_snapshot()
yield
snapshot_after = tracemalloc.take_snapshot()
# Compare snapshots
stats = snapshot_after.compare_to(snapshot_before, 'lineno')
self.logger.debug(f"Memory profile for {label}:")
for stat in stats[:10]: # Top 10 allocations
self.logger.debug(f" {stat}")
# Store snapshot for analysis
self.metrics['memory_snapshots'].append({
'label': label,
'timestamp': datetime.now(),
'snapshot': snapshot_after
})
# Line-level profiling for hot paths
@profile_cpu
async def process_sensor_data(self, data):
# Profile specific functions
self.line_profiler.add_function(self.validate_data)
self.line_profiler.add_function(self.transform_data)
self.line_profiler.add_function(self.analyze_data)
with self.profile_memory("process_sensor_data"):
# Validation
is_valid = await self.validate_data(data)
if not is_valid:
return None
# Transformation
transformed = await self.transform_data(data)
# Analysis
result = await self.analyze_data(transformed)
return result
# Async profiling with pyinstrument
async def profile_async_operation(self):
self.pyinstrument.start()
try:
# Simulate complex async operation
tasks = []
for i in range(10):
tasks.append(self.async_subtask(i))
results = await asyncio.gather(*tasks)
return results
finally:
self.pyinstrument.stop()
# Get profiling results
output = self.pyinstrument.output_text(
unicode=True,
show_all=True,
timeline=True
)
self.logger.debug(f"Async profile:\n{output}")
# Memory leak detection
def check_memory_leaks(self):
# Take snapshot
snapshot = tracemalloc.take_snapshot()
# Get top memory consumers
top_stats = snapshot.statistics('traceback')
self.logger.info("Top 10 memory allocations:")
for index, stat in enumerate(top_stats[:10], 1):
self.logger.info(f"#{index}: {stat.count} blocks, "
f"{stat.size / 1024 / 1024:.1f} MB")
for line in stat.traceback.format():
self.logger.debug(f" {line}")
# Production profiling with sampling
def enable_production_profiling(self):
# py-spy is a sampling profiler driven from the command line;
# invoke it as a subprocess against our own PID
import subprocess
async def profile_periodically():
while True:
await asyncio.sleep(3600)  # Wait 1 hour
self.logger.info("Starting production profiling")
output = f"/tmp/nexus_profile_{datetime.now().isoformat()}.svg"
# Sample at 100 Hz for 60 seconds and save a flame graph
# (subprocess.run blocks this coroutine for the sampling window)
subprocess.run([
"py-spy", "record",
"--pid", str(os.getpid()),
"--rate", "100",
"--duration", "60",
"--output", output,
])
asyncio.create_task(profile_periodically())
# Save profiling results
def save_cpu_profile(self):
stats = pstats.Stats(self.cpu_profiler)
stats.sort_stats('cumulative')
# Save to file
filename = f"profile_{self.module_id}_{datetime.now().isoformat()}.prof"
stats.dump_stats(filename)
# Log top functions
self.logger.info("Top 20 time-consuming functions:")
stats.print_stats(20)
# Usage with different profiling modes
if __name__ == "__main__":
import sys
module = ProfiledModule()
if "--profile-cpu" in sys.argv:
# CPU profiling mode
cProfile.run('asyncio.run(module.run())', 'module.prof')
elif "--profile-memory" in sys.argv:
# Memory profiling mode
from memory_profiler import memory_usage
mem_usage = memory_usage(module.run)
print(f"Memory usage: {max(mem_usage)} MB")
elif "--profile-lines" in sys.argv:
# Line profiling mode
lp = LineProfiler()
lp.add_function(module.process_sensor_data)
lp.enable()
asyncio.run(module.run())
lp.disable()
lp.print_stats()
else:
# Normal run
asyncio.run(module.run())
// C++ Performance Profiling Setup
#include <nexus/module.hpp>
#include <papi.h>                      // hardware performance counters
#include <gperftools/profiler.h>       // CPU profiler (ProfilerStart/ProfilerStop)
#include <gperftools/heap-profiler.h>  // heap profiler (HeapProfilerStart/Stop)
#include <valgrind/callgrind.h>        // CALLGRIND_* instrumentation macros
#include <malloc.h>                    // mallinfo2
#include <cstdint>
#include <fstream>
#include <fmt/format.h>
#include <nlohmann/json.hpp>
using nlohmann::json;
class ProfiledModule : public nexus::ModuleBase {
private:
struct PerfStats {
std::atomic<uint64_t> message_count{0};
std::atomic<uint64_t> total_processing_ns{0};
std::atomic<uint64_t> cache_misses{0};
std::atomic<uint64_t> branch_mispredicts{0};
};
PerfStats stats_;
int papi_event_set_ = PAPI_NULL;
public:
ProfiledModule() : ModuleBase("profiled-module", "Performance Profiled Module", "1.0.0") {
// Initialize PAPI for hardware counters
if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
logger()->error("PAPI initialization failed");
}
// Create event set for hardware counters
PAPI_create_eventset(&papi_event_set_);
PAPI_add_event(papi_event_set_, PAPI_L1_DCM); // L1 data cache misses
PAPI_add_event(papi_event_set_, PAPI_BR_MSP); // Branch mispredictions
// Enable heap profiling in debug mode
#ifdef DEBUG
HeapProfilerStart("nexus_module_heap");
#endif
}
~ProfiledModule() {
#ifdef DEBUG
HeapProfilerStop();
#endif
}
protected:
void on_initialized() override {
// Start CPU profiling if requested
if (config().get("enable_cpu_profiling", false)) {
ProfilerStart("nexus_module_cpu.prof");
}
// Enable perf integration
if (config().get("enable_perf", false)) {
enable_perf_profiling();
}
}
private:
// High-resolution timing with RAII
class ScopedTimer {
ProfiledModule* module_;
std::string operation_;
std::chrono::high_resolution_clock::time_point start_;
long long counters_start_[2];
public:
ScopedTimer(ProfiledModule* module, const std::string& operation)
: module_(module), operation_(operation),
start_(std::chrono::high_resolution_clock::now()) {
// Start hardware counters
if (module_->papi_event_set_ != PAPI_NULL) {
PAPI_start(module_->papi_event_set_);
PAPI_read(module_->papi_event_set_, counters_start_);
}
// Callgrind instrumentation
CALLGRIND_START_INSTRUMENTATION;
}
~ScopedTimer() {
CALLGRIND_STOP_INSTRUMENTATION;
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
end - start_).count();
// Read hardware counters
if (module_->papi_event_set_ != PAPI_NULL) {
long long counters_end[2];
PAPI_stop(module_->papi_event_set_, counters_end);
module_->stats_.cache_misses +=
counters_end[0] - counters_start_[0];
module_->stats_.branch_mispredicts +=
counters_end[1] - counters_start_[1];
}
module_->stats_.total_processing_ns += duration;
module_->stats_.message_count++;
// Log slow operations
if (duration > 1000000) { // > 1ms
module_->logger()->warn("{} took {}ms", operation_,
duration / 1000000.0);
}
}
};
// Profile message processing
void handle_message(const nexus::Message& message) override {
ScopedTimer timer(this, "handle_message");
// Simulate processing with different code paths
if (message.topic.starts_with("sensor.")) {
process_sensor_message(message);
} else if (message.topic.starts_with("command.")) {
process_command_message(message);
}
}
// Memory profiling helpers
void profile_memory_usage() {
// Manual heap profiling checkpoint
#ifdef DEBUG
HeapProfilerDump("checkpoint");
#endif
// Get memory statistics
struct mallinfo2 mi = mallinfo2();
logger()->debug("Memory stats: arena={}MB, ordblks={}, hblks={}",
mi.arena / 1024 / 1024, mi.ordblks, mi.hblks);
// Custom memory tracking
size_t rss = get_rss();
size_t vsize = get_vsize();
logger()->debug("Process memory: RSS={}MB, VSIZE={}MB",
rss / 1024 / 1024, vsize / 1024 / 1024);
}
// Sampling profiler integration
void enable_perf_profiling() {
// Generate perf map for JIT code
std::ofstream perf_map(fmt::format("/tmp/perf-{}.map", getpid()));
// Write module symbols (casting member-function pointers to data pointers
// is compiler-specific; GCC accepts it with -Wno-pmf-conversions)
void* start = reinterpret_cast<void*>(&ProfiledModule::handle_message);
void* end = reinterpret_cast<void*>(&ProfiledModule::profile_memory_usage);
perf_map << fmt::format("{:x} {:x} ProfiledModule::handle_message\n",
reinterpret_cast<uintptr_t>(start),
reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start));
}
// Flame graph generation helper
void generate_flame_graph() {
system("perf record -F 99 -p $(pidof nexus_module) -g -- sleep 30");
system("perf script | stackcollapse-perf.pl | flamegraph.pl > flame.svg");
}
public:
// Expose performance statistics
json get_performance_stats() const {
auto count = stats_.message_count.load();
auto total_ns = stats_.total_processing_ns.load();
return {
{"message_count", count},
{"avg_latency_us", count > 0 ? (total_ns / count / 1000.0) : 0},
{"cache_misses", stats_.cache_misses.load()},
{"branch_mispredicts", stats_.branch_mispredicts.load()},
{"cache_miss_rate", count > 0 ?
(double)stats_.cache_misses / count : 0},
{"branch_mispredict_rate", count > 0 ?
(double)stats_.branch_mispredicts / count : 0}
};
}
};
// Profiling with Linux perf
/*
# CPU profiling
perf record -F 99 -p $(pidof nexus_module) -g -- sleep 30
perf report
# Cache profiling
perf stat -e cache-references,cache-misses,instructions,cycles \
-p $(pidof nexus_module) -- sleep 10
# Lock contention profiling
perf lock record -p $(pidof nexus_module) -- sleep 30
perf lock report
# Memory profiling with valgrind
valgrind --tool=massif --massif-out-file=massif.out ./nexus_module
ms_print massif.out > massif.txt
# CPU flame graphs
perf record -F 999 -p $(pidof nexus_module) -g -- sleep 30
perf script | ./FlameGraph/stackcollapse-perf.pl | \
./FlameGraph/flamegraph.pl > cpu_flame.svg
*/
classdef ProfiledModule < NexusModule
properties (Access = private)
performanceData
profilingEnabled
memorySnapshots
executionTimes
memoryTimer
end
methods
function obj = ProfiledModule()
obj@NexusModule('profiled-module', 'Performance Profiled Module', '1.0.0');
% Initialize performance tracking
obj.performanceData = struct(...
'messageCount', 0, ...
'totalProcessingTime', 0, ...
'peakMemory', 0, ...
'functionCalls', containers.Map() ...
);
obj.memorySnapshots = [];
obj.executionTimes = [];
obj.profilingEnabled = false;
end
function enableProfiling(obj)
% Enable MATLAB profiler
obj.profilingEnabled = true;
profile on -history -memory;
% Start memory monitoring
obj.startMemoryMonitoring();
% Enable JVM profiling if available
if usejava('jvm')
java.lang.management.ManagementFactory.getMemoryMXBean();
end
end
% Profile message processing
function processMessage(obj, message)
if obj.profilingEnabled
% Start timing
ticID = tic;
% Memory before
memBefore = memory;
% Process with profiling
try
result = obj.processMessageInternal(message);
% Record timing
elapsed = toc(ticID);
obj.recordTiming('processMessage', elapsed);
% Memory after
memAfter = memory;
obj.recordMemoryUsage(memBefore, memAfter);
catch ME
% Log error with performance context
elapsed = toc(ticID);
obj.logger.error(sprintf(...
'Message processing failed after %.2fms: %s', ...
elapsed * 1000, ME.message));
rethrow(ME);
end
else
% Normal processing without profiling
result = obj.processMessageInternal(message);
end
end
% Detailed function profiling
function result = profileFunction(obj, funcHandle, funcName, varargin)
if obj.profilingEnabled
% Use MATLAB's built-in profiler
profile on;
% Time the function
t = timeit(@() funcHandle(varargin{:}));
% Get profile info
p = profile('info');
profile off;
% Find function in profile data
funcData = [];
for i = 1:length(p.FunctionTable)
if contains(p.FunctionTable(i).FunctionName, funcName)
funcData = p.FunctionTable(i);
break;
end
end
if ~isempty(funcData)
obj.logger.debug(sprintf(...
'%s: Total=%.2fms, Self=%.2fms, Calls=%d', ...
funcName, ...
funcData.TotalTime * 1000, ...
funcData.SelfTime * 1000, ...
funcData.NumCalls));
end
% Execute and return result
result = funcHandle(varargin{:});
else
result = funcHandle(varargin{:});
end
end
% Memory profiling
function startMemoryMonitoring(obj)
% Create timer for periodic memory snapshots
obj.memoryTimer = timer(...
'Period', 5, ... % Every 5 seconds
'ExecutionMode', 'fixedRate', ...
'TimerFcn', @(~,~) obj.takeMemorySnapshot() ...
);
start(obj.memoryTimer);
end
function takeMemorySnapshot(obj)
% Get current memory usage
mem = memory;
% JVM memory if available
if usejava('jvm')
runtime = java.lang.Runtime.getRuntime();
jvmMem = struct(...
'total', runtime.totalMemory() / 1024 / 1024, ...
'free', runtime.freeMemory() / 1024 / 1024, ...
'max', runtime.maxMemory() / 1024 / 1024 ...
);
else
jvmMem = struct('total', 0, 'free', 0, 'max', 0);
end
% Store snapshot
snapshot = struct(...
'timestamp', datetime('now'), ...
'matlabUsed', mem.MemUsedMATLAB / 1024 / 1024, ... % MB
'matlabAvailable', mem.MemAvailableAllArrays / 1024 / 1024, ...
'systemAvailable', mem.PhysicalMemory.Available / 1024 / 1024, ...
'jvmUsed', jvmMem.total - jvmMem.free, ...
'jvmMax', jvmMem.max ...
);
obj.memorySnapshots = [obj.memorySnapshots, snapshot];
% Detect memory leaks
if length(obj.memorySnapshots) > 10
recentSnapshots = obj.memorySnapshots(end-9:end);
memoryGrowth = recentSnapshots(end).matlabUsed - ...
recentSnapshots(1).matlabUsed;
if memoryGrowth > 100 % 100MB growth
obj.logger.warning(sprintf(...
'Potential memory leak: %+.1fMB growth in last 50s', ...
memoryGrowth));
end
end
end
% Code coverage analysis
function runWithCoverage(obj, testFunction)
% Enable code coverage
import matlab.unittest.plugins.CodeCoveragePlugin
import matlab.unittest.TestRunner
% Create test suite
suite = matlab.unittest.TestSuite.fromFunction(testFunction);
% Create test runner with an HTML coverage report
import matlab.unittest.plugins.codecoverage.CoverageReport
runner = TestRunner.withTextOutput;
runner.addPlugin(CodeCoveragePlugin.forFolder(pwd, ...
'Producing', CoverageReport('coverage_report')));
% Run tests; the coverage report is written to coverage_report/
results = runner.run(suite);
end
% Generate performance report
function report = generatePerformanceReport(obj)
if ~obj.profilingEnabled
error('Profiling not enabled');
end
% Get profiler data
p = profile('info');
profile off;
% Create report structure
report = struct();
% Top time consumers
[~, idx] = sort([p.FunctionTable.TotalTime], 'descend');
topFunctions = p.FunctionTable(idx(1:min(20, length(idx))));
report.topFunctions = arrayfun(@(f) struct(...
'name', f.FunctionName, ...
'totalTime', f.TotalTime, ...
'selfTime', f.SelfTime, ...
'calls', f.NumCalls ...
), topFunctions);
% Memory analysis
if ~isempty(obj.memorySnapshots)
report.memoryStats = struct(...
'peakMemory', max([obj.memorySnapshots.matlabUsed]), ...
'averageMemory', mean([obj.memorySnapshots.matlabUsed]), ...
'memoryGrowth', obj.memorySnapshots(end).matlabUsed - ...
obj.memorySnapshots(1).matlabUsed ...
);
end
% Execution time statistics
if obj.performanceData.functionCalls.Count > 0
keys = obj.performanceData.functionCalls.keys;
report.functionStats = cell(1, length(keys));
for i = 1:length(keys)
times = obj.performanceData.functionCalls(keys{i});
report.functionStats{i} = struct(...
'function', keys{i}, ...
'calls', length(times), ...
'meanTime', mean(times) * 1000, ... % ms
'maxTime', max(times) * 1000, ...
'minTime', min(times) * 1000 ...
);
end
end
% Generate HTML report
profsave(p, 'profile_results');
% Generate plots
obj.generatePerformancePlots();
end
% Visualization of performance data
function generatePerformancePlots(obj)
figure('Name', 'Module Performance Analysis');
% Memory usage over time
subplot(2, 2, 1);
if ~isempty(obj.memorySnapshots)
times = [obj.memorySnapshots.timestamp];
memUsed = [obj.memorySnapshots.matlabUsed]; % avoid shadowing the memory function
plot(times, memUsed, 'b-', 'LineWidth', 2);
xlabel('Time');
ylabel('Memory (MB)');
title('Memory Usage Over Time');
grid on;
end
% Function execution times
subplot(2, 2, 2);
if obj.performanceData.functionCalls.Count > 0
keys = obj.performanceData.functionCalls.keys;
meanTimes = zeros(1, length(keys));
for i = 1:length(keys)
times = obj.performanceData.functionCalls(keys{i});
meanTimes(i) = mean(times) * 1000; % ms
end
bar(meanTimes);
set(gca, 'XTickLabel', keys, 'XTickLabelRotation', 45);
ylabel('Mean Time (ms)');
title('Function Execution Times');
grid on;
end
% Save figure
saveas(gcf, 'performance_analysis.png');
end
end
methods (Access = private)
function recordTiming(obj, functionName, elapsed)
if obj.performanceData.functionCalls.isKey(functionName)
times = obj.performanceData.functionCalls(functionName);
obj.performanceData.functionCalls(functionName) = [times, elapsed];
else
obj.performanceData.functionCalls(functionName) = elapsed;
end
obj.performanceData.messageCount = obj.performanceData.messageCount + 1;
obj.performanceData.totalProcessingTime = ...
obj.performanceData.totalProcessingTime + elapsed;
end
function recordMemoryUsage(obj, memBefore, memAfter)
memUsed = (memAfter.MemUsedMATLAB - memBefore.MemUsedMATLAB) / 1024 / 1024;
if memUsed > 10 % More than 10MB allocated
obj.logger.warning(sprintf(...
'Large memory allocation detected: %.1fMB', memUsed));
end
obj.performanceData.peakMemory = max(obj.performanceData.peakMemory, ...
memAfter.MemUsedMATLAB / 1024 / 1024);
end
end
end
// LabVIEW Performance Profiling Techniques
//
// 1. Built-in Profiling Tools:
// - Profile Performance and Memory Window
// - VI Profiler (Tools >> Profile >> Performance and Memory)
// - Desktop Execution Trace Toolkit
//
// 2. Enable Profiling:
// a) Open VI Properties
// b) Navigate to Execution category
// c) Check "Enable debugging" and "Enable Profile"
// d) Set "Profile memory usage" if needed
//
// 3. Performance Profiling Pattern:
//
// Main Module VI Structure:
// ┌─────────────────────────────────────────┐
// │ Profile Start │
// │ ├─ Get Tick Count (ms) │
// │ ├─ Memory Snapshot VI │
// │ └─ Reset Profile Metrics VI │
// ├─────────────────────────────────────────┤
// │ Module Initialization │
// │ ├─ Profile Point: "Init Start" │
// │ ├─ Initialize Resources │
// │ └─ Profile Point: "Init Complete" │
// ├─────────────────────────────────────────┤
// │ Main Processing Loop │
// │ ├─ Profile Point: "Loop Start" │
// │ ├─ Process Messages │
// │ ├─ Update Metrics │
// │ └─ Profile Point: "Loop End" │
// ├─────────────────────────────────────────┤
// │ Profile End │
// │ ├─ Calculate Total Time │
// │ ├─ Generate Report │
// │ └─ Save Profile Data │
// └─────────────────────────────────────────┘
//
// 4. Custom Profiling SubVIs:
//
// Profile Point.vi:
// Inputs:
// - Profile Name (string)
// - Enable Profiling (boolean)
// Outputs:
// - Error Out
// Implementation:
// - Get current tick count
// - Get memory info
// - Store in global profile data
//
// Memory Snapshot.vi:
// Inputs:
// - Snapshot Name (string)
// Outputs:
// - Memory Used (numeric)
// - Error Out
// Implementation:
// - Call Request Deallocation function
// - Get VI memory usage
// - Store snapshot data
//
// 5. Real-Time Profiling:
//
// For RT targets:
// - Use RT Execution Trace Toolkit
// - Configure trace sessions
// - Minimize profiling overhead
// - Use deterministic profiling
//
// 6. Memory Profiling:
//
// Memory Monitor Loop:
// ┌─────────────────────────────────────────┐
// │ While Loop (1Hz) │
// │ ├─ Get Memory Info │
// │ ├─ Check for Leaks │
// │ ├─ Log if Threshold Exceeded │
// │ └─ Update Memory Graph │
// └─────────────────────────────────────────┘
//
// 7. CPU Profiling:
//
// - Use Timed Loops for precise timing
// - Profile SubVI execution times
// - Identify hot paths
// - Monitor CPU usage percentage
//
// 8. Best Practices:
//
// - Profile in Development mode first
// - Use conditional disable structures
// - Minimize profiling in production
// - Store profile data efficiently
// - Use Producer/Consumer for logging
//
// 9. Analysis Tools:
//
// Profile Analysis.vi:
// - Load profile data
// - Calculate statistics
// - Generate timing charts
// - Identify bottlenecks
// - Export to Excel/CSV
//
// 10. Common Bottlenecks:
//
// - Array operations in loops
// - Excessive property nodes
// - Synchronous file I/O
// - UI thread blocking
// - Memory copies
//
// Example Profiling Configuration:
// [Nexus Module Settings]
// EnableProfiling=true
// ProfileMemory=true
// ProfileInterval=1000
// SaveProfileData=true
// ProfileDataPath=/logs/profile/
Key Metrics to Monitor
Performance Metrics
Metric | Description | Target Range | Collection Method |
---|---|---|---|
Message Processing Latency | Time from message receipt to completion | < 10ms (p99) | Timer around handler |
CPU Usage | Processor utilization percentage | < 70% sustained | OS performance counters |
Memory Allocation Rate | MB/s of new allocations | < 10 MB/s | Heap profiler |
Garbage Collection | GC pause time and frequency | < 50ms pauses | Runtime metrics |
Thread Pool Usage | Active vs available threads | < 80% utilization | Thread pool stats |
I/O Wait Time | Time blocked on I/O operations | < 20% of CPU time | System profiler |
Lock Contention | Time waiting for locks | < 5% of CPU time | Lock profiler |
Cache Hit Rate | L1/L2/L3 cache efficiency | > 90% L1 hits | Hardware counters |
Performance Metrics Collection
// Using SDK-provided metrics collection
public class PerformanceMonitoringModule : ModuleBase
{
private IHistogram _processingTime;
private ICounter _messageCount;
private IGauge _queueDepth;
private IHistogram _transformSize;
protected override async Task OnInitializeAsync()
{
// Create metrics using the SDK's built-in metrics system
_processingTime = Metrics.CreateHistogram(
"message_processing_duration_ms",
"Time taken to process messages",
new[] { "message_type", "status" }
);
_messageCount = Metrics.CreateCounter(
"messages_processed_total",
"Total number of messages processed",
new[] { "message_type", "status" }
);
_queueDepth = Metrics.CreateGauge(
"message_queue_depth",
"Current depth of message queue"
);
_transformSize = Metrics.CreateHistogram(
"transform_size_bytes",
"Size of transformed messages",
new[] { "message_type" }
);
}
// Usage in module
public async Task<TransformResult> ProcessMessage(Message message) // TransformResult is the module's own type
{
// Automatically measure processing time
using (var timer = _processingTime.StartTimer(
labels: new[] { message.Type, "processing" }))
{
// Validation
using (Metrics.MeasureDuration("message_validation_duration_ms"))
{
ValidateMessage(message);
}
// Processing
var result = await TransformMessage(message);
// Record custom metrics
_transformSize.Observe(result.Size, message.Type);
_messageCount.Increment(message.Type, "success");
// Update gauge
_queueDepth.Set(GetCurrentQueueDepth());
return result;
}
}
// SDK automatically exports metrics - no manual export needed
// Metrics are available via:
// - Prometheus endpoint: /metrics
// - OpenTelemetry export
// - Built-in dashboards
// For custom metric queries
public async Task<MetricSnapshot> GetMetricSnapshot(string metricName)
{
return await Metrics.GetSnapshot(metricName);
}
// For alerting based on metrics
protected override async Task OnMetricThresholdExceeded(MetricAlert alert)
{
if (alert.MetricName == "message_processing_duration_ms" &&
alert.Value > 1000)
{
Logger.Warning("Processing time exceeded 1 second: {Value}ms", alert.Value);
// Trigger adaptive behavior
await EnableDegradedMode();
}
}
}
Profiling Techniques
Sampling vs Instrumentation
Sampling Profilers
- Low Overhead: Minimal impact on performance (~1-5%)
- Statistical: May miss short-lived functions
- Production Safe: Can run continuously
- Best For: Overall performance analysis, hot spot identification
Instrumentation Profilers
- Precise: Exact call counts and timing
- High Overhead: Can slow down execution significantly
- Detailed: Full call graph information
- Best For: Detailed analysis, debugging specific issues (see the sketch below for gating instrumentation overhead)
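To make the trade-off concrete, here is a minimal sketch of gating instrumentation behind a configuration flag so its per-call cost can be switched off in production, while a sampling profiler (dotnet-trace, perf, py-spy) is still free to observe the process from outside. All names here are illustrative:
using System;
using System.Diagnostics;

public static class Instrumented
{
    // Toggle from configuration; when off, instrumentation costs one branch per call
    public static bool Enabled { get; set; }

    public static T Measure<T>(string name, Func<T> body, Action<string, double> record)
    {
        if (!Enabled)
            return body(); // sampling profilers still see this frame statistically

        var sw = Stopwatch.StartNew();
        try
        {
            return body();
        }
        finally
        {
            sw.Stop();
            record(name, sw.Elapsed.TotalMilliseconds); // exact per-call timing
        }
    }
}
A call site then reads Instrumented.Measure("transform", () => Transform(msg), recorder) with whatever record delegate your metrics system provides, and costs almost nothing when the flag is off.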
Continuous Profiling
// Continuous profiling in production
public class ContinuousProfiler
{
private readonly Timer _profilingTimer;
private readonly string _outputPath;
private bool _isProfileActive;
public ContinuousProfiler(string outputPath)
{
_outputPath = outputPath;
// Profile for 30 seconds every hour
_profilingTimer = new Timer(
callback: _ => ProfileAsync().Wait(),
state: null,
dueTime: TimeSpan.FromMinutes(1),
period: TimeSpan.FromHours(1)
);
}
private async Task ProfileAsync()
{
if (_isProfileActive) return;
_isProfileActive = true;
var timestamp = DateTime.UtcNow.ToString("yyyyMMdd_HHmmss");
try
{
// CPU profiling
var cpuFile = Path.Combine(_outputPath, $"cpu_{timestamp}.nettrace");
await CollectCpuProfile(cpuFile, TimeSpan.FromSeconds(30));
// Memory snapshot
var memFile = Path.Combine(_outputPath, $"memory_{timestamp}.gcdump");
await CollectMemoryDump(memFile);
// Convert to flame graph
await GenerateFlameGraph(cpuFile);
// Upload to monitoring system
await UploadProfiles(timestamp);
// Clean up old profiles
CleanupOldProfiles();
}
finally
{
_isProfileActive = false;
}
}
private async Task CollectCpuProfile(string outputFile, TimeSpan duration)
{
var providers = new List<EventPipeProvider>
{
new EventPipeProvider("Microsoft-Windows-DotNETRuntime",
EventLevel.Informational,
(long)ClrTraceEventParser.Keywords.Default),
new EventPipeProvider("Microsoft-DotNETCore-SampleProfiler",
EventLevel.Informational)
};
var client = new DiagnosticsClient(Process.GetCurrentProcess().Id);
using var session = client.StartEventPipeSession(providers, false);
var collectTask = Task.Run(async () =>
{
using var stream = File.Create(outputFile);
await session.EventStream.CopyToAsync(stream);
});
await Task.Delay(duration);
session.Stop();
await collectTask;
}
private async Task GenerateFlameGraph(string traceFile)
{
var process = new Process
{
StartInfo = new ProcessStartInfo
{
FileName = "dotnet-trace",
Arguments = $"convert {traceFile} --format speedscope",
UseShellExecute = false,
RedirectStandardOutput = true
}
};
process.Start();
await process.WaitForExitAsync();
}
}
Performance Analysis
Identifying Bottlenecks
CPU Bottlenecks
- High CPU usage with low throughput
- Hot functions consuming >10% CPU
- Excessive context switching
- Poor algorithm complexity (O(n²) or worse)
Memory Bottlenecks
- Frequent garbage collections
- Large object heap fragmentation
- Memory leaks (growing heap size)
- Excessive allocations in hot paths
I/O Bottlenecks
- High I/O wait times
- Synchronous I/O in async contexts (often surfaces as thread-pool starvation; see the sketch below)
- Inefficient database queries
- Network latency issues
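Thread-pool starvation is a common signature of the I/O pitfalls above. A minimal sketch of a cheap runtime probe, using only standard .NET thread-pool APIs; the 80% threshold matches the target from the metrics table earlier in this section:
using System;
using System.Threading;

public static class ThreadPoolProbe
{
    // Log a warning when worker-thread utilization crosses the target range
    public static void Check(Action<string> log)
    {
        ThreadPool.GetMaxThreads(out int maxWorkers, out _);
        ThreadPool.GetAvailableThreads(out int freeWorkers, out _);

        double utilization = 100.0 * (maxWorkers - freeWorkers) / maxWorkers;
        if (utilization > 80)
        {
            log($"Thread pool pressure: {utilization:F0}% workers busy, " +
                $"{ThreadPool.PendingWorkItemCount} queued work items");
        }
    }
}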
Flame Graph Analysis
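A flame graph renders sampled call stacks as nested boxes: box width is the share of samples containing that frame (not the order of execution), and stacking shows caller/callee relationships. Read it by looking for wide plateaus along the top edge, since those are the frames actually running on-CPU; narrow, deep towers usually indicate heavy call chains rather than hot code. Use the perf and dotnet-trace recipes shown earlier in this section to generate SVG or speedscope flame graphs, and capture one before and one after each optimization so the effect of a change shows up as a width difference.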
Memory Leak Detection
// Memory leak detection patterns
public class MemoryLeakDetector
{
private readonly List<WeakReference> _trackedObjects = new();
private readonly Timer _gcTimer;
private long _lastHeapSize;
public MemoryLeakDetector()
{
_gcTimer = new Timer(_ => CheckForLeaks(), null,
TimeSpan.FromMinutes(5), TimeSpan.FromMinutes(5));
}
public void TrackObject(object obj)
{
_trackedObjects.Add(new WeakReference(obj));
}
private void CheckForLeaks()
{
// Force full GC
GC.Collect(2, GCCollectionMode.Forced);
GC.WaitForPendingFinalizers();
GC.Collect(2, GCCollectionMode.Forced);
// Check tracked objects
var aliveCount = _trackedObjects.Count(wr => wr.IsAlive);
if (aliveCount > 100)
{
Logger.Warning($"Potential leak: {aliveCount} objects still alive");
// Analyze alive objects
var aliveTypes = _trackedObjects
.Where(wr => wr.IsAlive)
.Select(wr => wr.Target?.GetType().Name)
.GroupBy(t => t)
.OrderByDescending(g => g.Count())
.Take(10);
foreach (var typeGroup in aliveTypes)
{
Logger.Warning($" {typeGroup.Key}: {typeGroup.Count()} instances");
}
}
// Check heap growth
var currentHeap = GC.GetTotalMemory(false);
var growth = currentHeap - _lastHeapSize;
if (_lastHeapSize > 0 && growth > 50_000_000) // 50MB growth
{
Logger.Warning($"Heap grew by {growth / 1_000_000}MB");
// Take memory dump
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
{
CollectMemoryDump();
}
}
_lastHeapSize = currentHeap;
// Clean up dead references
_trackedObjects.RemoveAll(wr => !wr.IsAlive);
}
private void CollectMemoryDump()
{
var dumpFile = $"memdump_{DateTime.Now:yyyyMMdd_HHmmss}.dmp";
var process = Process.GetCurrentProcess();
// Use createdump tool
var createdump = new Process
{
StartInfo = new ProcessStartInfo
{
FileName = "createdump",
Arguments = $"--full {process.Id} --name {dumpFile}",
UseShellExecute = false
}
};
createdump.Start();
createdump.WaitForExit();
Logger.Info($"Memory dump saved to {dumpFile}");
}
}
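Typical usage is to register suspect objects as they are created and let the periodic check report anything the garbage collector could not reclaim (a sketch; the connection type is illustrative):
var leakDetector = new MemoryLeakDetector();

// Register objects you expect to be short-lived; if many are still alive
// after a forced collection, something is holding a reference to them
var connection = new SensorConnection();
leakDetector.TrackObject(connection);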
Integration with NEXUS-1
Built-in Performance Counters
NEXUS-1 provides performance counters that modules can use to report metrics.
// Using NEXUS-1 performance counters
public class InstrumentedModule : ModuleBase
{
private readonly IPerformanceCounters _counters;
public InstrumentedModule(IModuleContext context) : base(context)
{
// Register performance counters
_counters = context.PerformanceCounters;
_counters.RegisterCounter("messages_processed", CounterType.Counter,
"Total messages processed");
_counters.RegisterCounter("processing_time_ms", CounterType.Histogram,
"Message processing time in milliseconds");
_counters.RegisterCounter("active_connections", CounterType.Gauge,
"Number of active connections");
_counters.RegisterCounter("errors_total", CounterType.Counter,
"Total number of errors");
}
protected override async Task OnMessageAsync(Message message)
{
using var timer = _counters.StartTimer("processing_time_ms");
try
{
await ProcessMessageInternal(message);
_counters.Increment("messages_processed");
}
catch (Exception ex)
{
_counters.Increment("errors_total");
_counters.Increment($"errors_total", new[] {
("error_type", ex.GetType().Name)
});
throw;
}
}
// Report custom metrics
private async Task ReportMetrics()
{
var metrics = new ModuleMetrics
{
ModuleId = ModuleId,
Timestamp = DateTime.UtcNow,
Counters = _counters.GetSnapshot(),
CustomMetrics = new Dictionary<string, double>
{
["queue_depth"] = GetQueueDepth(),
["cache_hit_rate"] = CalculateCacheHitRate(),
["memory_mb"] = GC.GetTotalMemory(false) / 1024.0 / 1024.0
}
};
await Messages.PublishAsync("metrics.module", metrics);
}
}
// Metrics aggregation in NEXUS-1
public class MetricsAggregator : ModuleBase
{
private readonly Dictionary<string, ModuleMetrics> _latestMetrics = new();
protected override void OnInitialized()
{
Messages.Subscribe("metrics.module", async message =>
{
var metrics = message.GetPayload<ModuleMetrics>();
_latestMetrics[metrics.ModuleId] = metrics;
// Aggregate and publish system-wide metrics
await PublishAggregatedMetrics();
});
}
private async Task PublishAggregatedMetrics()
{
var aggregated = new SystemMetrics
{
Timestamp = DateTime.UtcNow,
TotalMessages = _latestMetrics.Values
.Sum(m => m.Counters["messages_processed"]),
AverageProcessingTime = _latestMetrics.Values
.Average(m => m.Counters["processing_time_ms_p50"]),
TotalErrors = _latestMetrics.Values
.Sum(m => m.Counters["errors_total"]),
ModuleCount = _latestMetrics.Count,
HealthyModules = _latestMetrics.Values
.Count(m => m.IsHealthy)
};
await Messages.PublishAsync("metrics.system", aggregated);
}
}
Performance Dashboards
Create real-time dashboards to monitor module performance.
Grafana Integration
# Grafana dashboard query examples
# Message processing rate
rate(nexus_messages_processed_total[5m])
# P99 latency
histogram_quantile(0.99,
rate(nexus_processing_time_ms_bucket[5m]))
# Error rate
rate(nexus_errors_total[5m]) /
rate(nexus_messages_processed_total[5m])
# Memory usage by module
nexus_module_memory_mb{module_id=~"$module"}
# CPU usage
rate(process_cpu_seconds_total[5m]) * 100
Alert Rules
# Prometheus alert rules
groups:
  - name: nexus_performance
    rules:
      - alert: HighMessageLatency
        expr: |
          histogram_quantile(0.99,
            rate(nexus_processing_time_ms_bucket[5m])
          ) > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High message processing latency"
      - alert: MemoryLeak
        # deriv() rather than rate(): memory is a gauge, not a counter
        expr: |
          deriv(nexus_module_memory_mb[30m]) > 0.1
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Potential memory leak detected"
Best Practices
Profiling Guidelines
- Profile Early and Often: Don't wait for performance problems
- Establish Baselines: Know your normal performance characteristics
- Use Production-Like Data: Profile with realistic workloads
- Profile in Context: Consider the entire system, not just your module
- Automate Performance Tests: Include in CI/CD pipeline
- Monitor Continuously: Use low-overhead profiling in production
- Document Findings: Keep records of optimizations and their impact
- Profile Before and After: Measure the impact of changes
Common Pitfalls
- Premature Optimization: Profile first, optimize based on data
- Micro-Benchmarks: Can be misleading, profile real scenarios
- Debug vs Release: Always profile release builds
- Single Metrics: Look at multiple metrics together
- Profiler Overhead: Account for profiling impact
- Local vs Production: Production behavior may differ
Performance Regression Detection
// Automated performance regression testing
[TestClass]
public class PerformanceTests
{
[TestMethod]
[PerformanceBenchmark("MessageProcessing", MaxDuration = 10)]
public async Task ProcessMessage_Performance()
{
// Arrange
var module = new OptimizedModule();
var message = GenerateTestMessage(1000);
// Act & Assert
await BenchmarkRunner.Run(async () =>
{
await module.ProcessMessage(message);
}, options =>
{
options.Iterations = 1000;
options.WarmupIterations = 100;
options.MaxDuration = TimeSpan.FromMilliseconds(10);
options.MaxMemoryIncrease = 1_000_000; // 1MB
options.MaxGCGen2Collections = 1;
});
}
}
[AttributeUsage(AttributeTargets.Method)]
public class PerformanceBenchmarkAttribute : Attribute
{
public string Name { get; }
public double MaxDuration { get; set; }
public long MaxMemory { get; set; }
public PerformanceBenchmarkAttribute(string name)
{
Name = name;
}
}
public static class BenchmarkRunner
{
public static async Task Run(Func<Task> action, Action<BenchmarkOptions> configure)
{
var options = new BenchmarkOptions();
configure(options);
// Warmup
for (int i = 0; i < options.WarmupIterations; i++)
{
await action();
}
// Measure
var results = new List<BenchmarkResult>();
var sw = new Stopwatch();
var initialMemory = GC.GetTotalMemory(true);
var initialGen2 = GC.CollectionCount(2);
for (int i = 0; i < options.Iterations; i++)
{
sw.Restart();
await action();
sw.Stop();
results.Add(new BenchmarkResult
{
Duration = sw.Elapsed,
MemoryUsed = GC.GetTotalMemory(false) - initialMemory
});
}
// Analyze results
var stats = new BenchmarkStatistics(results);
// Check against thresholds
Assert.IsTrue(stats.P99Duration < options.MaxDuration,
$"P99 duration {stats.P99Duration.TotalMilliseconds}ms exceeds max {options.MaxDuration.TotalMilliseconds}ms");
Assert.IsTrue(stats.MaxMemoryIncrease < options.MaxMemoryIncrease,
$"Memory increase {stats.MaxMemoryIncrease} exceeds max {options.MaxMemoryIncrease}");
var gen2Collections = GC.CollectionCount(2) - initialGen2;
Assert.IsTrue(gen2Collections <= options.MaxGCGen2Collections,
$"Gen2 collections {gen2Collections} exceeds max {options.MaxGCGen2Collections}");
// Report results
Console.WriteLine($"Performance Test Results:");
Console.WriteLine($" P50: {stats.P50Duration.TotalMilliseconds:F2}ms");
Console.WriteLine($" P99: {stats.P99Duration.TotalMilliseconds:F2}ms");
Console.WriteLine($" Memory: {stats.MaxMemoryIncrease / 1024}KB");
Console.WriteLine($" GC Gen2: {gen2Collections}");
}
}
Advanced Watchdog System
Implement sophisticated monitoring and automatic intervention patterns using the NEXUS-1 SDK's watchdog capabilities. Build systems that detect, diagnose, and recover from various failure modes automatically.
Watchdog System Overview
Why Advanced Watchdogs?
Beyond basic health checks, advanced watchdog systems provide:
- Deadlock Detection: Identify and resolve circular dependencies and resource contention
- Performance Anomaly Detection: Catch gradual degradation before it impacts users
- Resource Leak Prevention: Monitor and cap runaway resource consumption
- Cascade Failure Prevention: Stop failures from propagating through the system
- Intelligent Recovery: Context-aware recovery strategies based on failure patterns
Core Watchdog Patterns
Multi-Level Watchdog Architecture
Implement hierarchical monitoring with different intervention levels based on severity.
public class AdvancedWatchdogModule : ModuleBase
{
private readonly Dictionary<string, IWatchdog> _watchdogs;
private readonly WatchdogCoordinator _coordinator;
private readonly IRecoveryEngine _recoveryEngine;
public AdvancedWatchdogModule()
{
_watchdogs = new Dictionary<string, IWatchdog>();
_coordinator = new WatchdogCoordinator();
_recoveryEngine = Recovery.CreateRecoveryEngine();
// Configure watchdog hierarchy
ConfigureWatchdogs();
}
private void ConfigureWatchdogs()
{
// Level 1: Process Health Watchdog
var processWatchdog = new ProcessWatchdog(
checkInterval: TimeSpan.FromSeconds(5),
timeout: TimeSpan.FromSeconds(30)
);
processWatchdog.OnUnresponsive += HandleUnresponsiveProcess;
_watchdogs["process"] = processWatchdog;
// Level 2: Resource Watchdog
var resourceWatchdog = new ResourceWatchdog(
cpuThreshold: 90,
memoryThreshold: 85,
checkInterval: TimeSpan.FromSeconds(10)
);
resourceWatchdog.OnThresholdExceeded += HandleResourceExceeded;
_watchdogs["resource"] = resourceWatchdog;
// Level 3: Deadlock Watchdog
var deadlockWatchdog = new DeadlockWatchdog(
detectionInterval: TimeSpan.FromSeconds(30),
maxWaitTime: TimeSpan.FromMinutes(2)
);
deadlockWatchdog.OnDeadlockDetected += HandleDeadlock;
_watchdogs["deadlock"] = deadlockWatchdog;
// Level 4: Performance Watchdog
var performanceWatchdog = new PerformanceWatchdog(
baselineWindow: TimeSpan.FromHours(1),
deviationThreshold: 2.0 // 2x standard deviation
);
performanceWatchdog.OnAnomalyDetected += HandlePerformanceAnomaly;
_watchdogs["performance"] = performanceWatchdog;
}
protected override async Task OnInitializeAsync()
{
// Start all watchdogs
foreach (var watchdog in _watchdogs.Values)
{
await watchdog.StartAsync();
}
// Register composite health check
Health.AddCheck("watchdog-system", async () =>
{
var results = await Task.WhenAll(
_watchdogs.Values.Select(w => w.CheckHealthAsync())
);
if (results.All(r => r.Status == HealthStatus.Healthy))
return HealthCheckResult.Healthy("All watchdogs operational");
if (results.Any(r => r.Status == HealthStatus.Unhealthy))
return HealthCheckResult.Unhealthy("One or more watchdogs failed");
return HealthCheckResult.Degraded("Some watchdogs degraded");
});
// Subscribe to module lifecycle events
await Messages.SubscribeAsync("module.started", OnModuleStarted);
await Messages.SubscribeAsync("module.stopped", OnModuleStopped);
}
// Deadlock Detection and Resolution
private async Task HandleDeadlock(DeadlockInfo deadlock)
{
Logger.Critical($"Deadlock detected: {deadlock.Description}");
// Record in audit log
await Audit.SecurityEvent("DeadlockDetected")
.WithProperty("involvedModules", deadlock.InvolvedModules)
.WithProperty("resources", deadlock.LockedResources)
.WithProperty("duration", deadlock.Duration)
.RecordAsync();
// Attempt resolution strategies
var strategies = new IDeadlockResolutionStrategy[]
{
new TimeoutOldestStrategy(),
new RollbackTransactionStrategy(),
new RestartModuleStrategy(),
new EscalateStrategy()
};
foreach (var strategy in strategies)
{
try
{
var resolved = await strategy.ResolveAsync(deadlock);
if (resolved)
{
Logger.Info($"Deadlock resolved using {strategy.Name}");
// Notify system
await Messages.PublishAsync("watchdog.deadlock.resolved", new
{
DeadlockId = deadlock.Id,
Strategy = strategy.Name,
Timestamp = DateTime.UtcNow
});
return;
}
}
catch (Exception ex)
{
Logger.Error($"Strategy {strategy.Name} failed", ex);
}
}
// If all strategies fail, emergency shutdown
await EmergencyShutdown(deadlock);
}
// Resource Monitoring and Capping
private async Task HandleResourceExceeded(ResourceAlert alert)
{
Logger.Warning($"Resource threshold exceeded: {alert.ResourceType} at {alert.CurrentValue}%");
// Apply resource capping
switch (alert.ResourceType)
{
case ResourceType.CPU:
await ApplyCpuThrottling(alert.ModuleId, alert.CurrentValue);
break;
case ResourceType.Memory:
await ApplyMemoryPressure(alert.ModuleId, alert.CurrentValue);
break;
case ResourceType.IO:
await ApplyIoRateLimiting(alert.ModuleId, alert.CurrentValue);
break;
}
// Update metrics
Metrics.RecordGauge($"watchdog.resource.{alert.ResourceType.ToString().ToLower()}_usage",
alert.CurrentValue,
new[] { "module", alert.ModuleId });
}
// Performance Anomaly Detection
private async Task HandlePerformanceAnomaly(PerformanceAnomaly anomaly)
{
Logger.Warning($"Performance anomaly detected: {anomaly.Type}");
// Collect diagnostic data
var diagnostics = await CollectDiagnostics(anomaly);
// Determine intervention level
var interventionLevel = DetermineInterventionLevel(anomaly);
switch (interventionLevel)
{
case InterventionLevel.Monitor:
// Just log and continue monitoring
await RecordAnomaly(anomaly, diagnostics);
break;
case InterventionLevel.Adjust:
// Make automatic adjustments
await ApplyPerformanceTuning(anomaly);
break;
case InterventionLevel.Degrade:
// Enable degraded mode
await EnableDegradedMode(anomaly.AffectedModules);
break;
case InterventionLevel.Restart:
// Restart affected components
await RestartAffectedComponents(anomaly);
break;
}
}
// Hung Process Detection
private async Task HandleUnresponsiveProcess(ProcessInfo process)
{
Logger.Error($"Process {process.ModuleId} is unresponsive");
// Try gentle recovery first
var recovered = await _recoveryEngine.TryRecoverAsync(
process.ModuleId,
RecoveryLevel.Gentle
);
if (!recovered)
{
// Escalate to forceful recovery
recovered = await _recoveryEngine.TryRecoverAsync(
process.ModuleId,
RecoveryLevel.Force
);
}
if (!recovered)
{
// Last resort: kill and restart
await KillAndRestartModule(process);
}
}
}
// Specialized Watchdog Implementations
public class DeadlockWatchdog : IWatchdog
{
private readonly ILockManager _lockManager;
private readonly TimeSpan _detectionInterval;
private readonly TimeSpan _maxWaitTime;
public async Task<HealthCheckResult> CheckHealthAsync()
{
var waitGraphs = await _lockManager.GetWaitGraphsAsync();
var cycles = DetectCycles(waitGraphs);
if (cycles.Any())
{
var deadlocks = cycles
.Where(c => c.WaitTime > _maxWaitTime)
.ToList();
if (deadlocks.Any())
{
foreach (var deadlock in deadlocks)
{
OnDeadlockDetected?.Invoke(new DeadlockInfo
{
Id = Guid.NewGuid(),
InvolvedModules = deadlock.Nodes.Select(n => n.ModuleId).ToList(),
LockedResources = deadlock.Edges.Select(e => e.ResourceId).ToList(),
Duration = deadlock.WaitTime,
WaitGraph = deadlock
});
}
return HealthCheckResult.Unhealthy($"{deadlocks.Count} deadlocks detected");
}
return HealthCheckResult.Degraded($"{cycles.Count} potential deadlocks");
}
return HealthCheckResult.Healthy();
}
private List<WaitCycle> DetectCycles(WaitGraph graph)
{
// Implement cycle detection algorithm (e.g., DFS with color marking)
var cycles = new List();
var visited = new HashSet();
var recursionStack = new HashSet();
foreach (var node in graph.Nodes)
{
if (!visited.Contains(node.Id))
{
                var cyclesFromNode = DfsDetectCycles(
                    node, graph, visited, recursionStack, new List<WaitGraphNode>()
                );
cycles.AddRange(cyclesFromNode);
}
}
return cycles;
}
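    // DfsDetectCycles is referenced above but elided in this guide. A minimal
    // sketch follows; the WaitGraphNode/Cycle shapes and the graph.GetNeighbors
    // helper are assumptions, mirroring the Python implementation shown later.
    private IEnumerable<Cycle> DfsDetectCycles(
        WaitGraphNode node,
        WaitGraph graph,
        HashSet<string> visited,
        HashSet<string> recursionStack,
        List<WaitGraphNode> path)
    {
        var cycles = new List<Cycle>();
        visited.Add(node.Id);
        recursionStack.Add(node.Id);
        path.Add(node);
        foreach (var neighbor in graph.GetNeighbors(node))
        {
            if (!visited.Contains(neighbor.Id))
            {
                cycles.AddRange(DfsDetectCycles(
                    neighbor, graph, visited, recursionStack,
                    new List<WaitGraphNode>(path)));
            }
            else if (recursionStack.Contains(neighbor.Id))
            {
                // Back edge: the path slice from the revisited node is a cycle
                var start = path.FindIndex(n => n.Id == neighbor.Id);
                cycles.Add(new Cycle(path.Skip(start).ToList()));
            }
        }
        recursionStack.Remove(node.Id);
        return cycles;
    }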
}
// Performance Baseline and Anomaly Detection
public class PerformanceWatchdog : IWatchdog
{
    private readonly TimeSpan _baselineWindow;
    private readonly double _deviationThreshold;
    private readonly Dictionary<string, MetricBaseline> _baselines;
    public event Action<PerformanceAnomaly>? OnAnomalyDetected;
    public async Task<HealthCheckResult> CheckHealthAsync()
    {
        var currentMetrics = await CollectCurrentMetrics();
        var anomalies = new List<PerformanceAnomaly>();
foreach (var metric in currentMetrics)
{
if (_baselines.TryGetValue(metric.Name, out var baseline))
{
var deviation = CalculateDeviation(metric.Value, baseline);
if (Math.Abs(deviation) > _deviationThreshold)
{
anomalies.Add(new PerformanceAnomaly
{
MetricName = metric.Name,
ExpectedValue = baseline.Mean,
ActualValue = metric.Value,
Deviation = deviation,
Type = ClassifyAnomaly(metric, baseline, deviation)
});
}
}
}
if (anomalies.Any())
{
foreach (var anomaly in anomalies)
{
OnAnomalyDetected?.Invoke(anomaly);
}
return HealthCheckResult.Degraded(
$"{anomalies.Count} performance anomalies detected"
);
}
return HealthCheckResult.Healthy();
}
private AnomalyType ClassifyAnomaly(
Metric metric,
MetricBaseline baseline,
double deviation)
{
// Classify based on metric type and deviation pattern
if (metric.Name.Contains("latency") && deviation > 0)
return AnomalyType.LatencySpike;
if (metric.Name.Contains("throughput") && deviation < 0)
return AnomalyType.ThroughputDrop;
if (metric.Name.Contains("error") && deviation > 0)
return AnomalyType.ErrorRateIncrease;
return AnomalyType.Unknown;
}
}
from nexus_sdk import Module
from typing import Dict, List, Optional, Callable
from dataclasses import dataclass
from datetime import datetime, timedelta
import asyncio
import psutil
import threading
from collections import defaultdict
import statistics
@dataclass
class DeadlockInfo:
id: str
involved_modules: List[str]
locked_resources: List[str]
duration: timedelta
wait_graph: Dict
@dataclass
class ResourceAlert:
resource_type: str
module_id: str
current_value: float
threshold: float
timestamp: datetime
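# WatchdogResult is returned by the check_health() implementations below but
# is not defined in this excerpt; a minimal assumed shape for completeness:
@dataclass
class WatchdogResult:
    healthy: bool
    needs_intervention: bool
    details: Dict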
class AdvancedWatchdogModule(Module):
def __init__(self):
super().__init__()
self.watchdogs = {}
self.recovery_strategies = []
self.intervention_rules = {}
self._lock = threading.RLock()
self._resource_limits = {}
async def on_initialize(self):
# Configure watchdog hierarchy
await self._configure_watchdogs()
# Start monitoring tasks
asyncio.create_task(self._run_watchdog_loop())
# Subscribe to system events
await self.messages.subscribe("module.*", self._handle_module_event)
async def _configure_watchdogs(self):
# Process Health Watchdog
self.watchdogs['process'] = ProcessHealthWatchdog(
check_interval=5, # seconds
timeout=30,
on_unresponsive=self._handle_unresponsive_process
)
# Resource Watchdog
self.watchdogs['resource'] = ResourceWatchdog(
cpu_threshold=90,
memory_threshold=85,
io_threshold=80,
check_interval=10,
on_threshold_exceeded=self._handle_resource_exceeded
)
# Deadlock Watchdog
self.watchdogs['deadlock'] = DeadlockWatchdog(
detection_interval=30,
max_wait_time=120,
on_deadlock_detected=self._handle_deadlock
)
# Performance Watchdog
self.watchdogs['performance'] = PerformanceWatchdog(
baseline_window=3600, # 1 hour
deviation_threshold=2.0,
on_anomaly_detected=self._handle_performance_anomaly
)
# Cascade Failure Watchdog
self.watchdogs['cascade'] = CascadeFailureWatchdog(
failure_threshold=3,
time_window=60,
on_cascade_detected=self._handle_cascade_failure
)
async def _run_watchdog_loop(self):
"""Main watchdog monitoring loop"""
while not self.shutdown_requested:
try:
# Run all watchdog checks
results = await asyncio.gather(
*[w.check_health() for w in self.watchdogs.values()],
return_exceptions=True
)
# Process results
for i, result in enumerate(results):
watchdog_name = list(self.watchdogs.keys())[i]
if isinstance(result, Exception):
self.logger.error(
f"Watchdog {watchdog_name} failed: {result}"
)
elif result.needs_intervention:
await self._coordinate_intervention(
watchdog_name, result
)
await asyncio.sleep(1)
            except Exception as e:
                self.logger.error(f"Watchdog loop error: {e}")
                await asyncio.sleep(1)  # Back off so a persistent error cannot spin the loop
async def _handle_deadlock(self, deadlock: DeadlockInfo):
"""Handle detected deadlock"""
self.logger.critical(f"Deadlock detected: {deadlock.id}")
# Log to audit system
await self.audit.security_event("DeadlockDetected") \
.with_property("involved_modules", deadlock.involved_modules) \
.with_property("locked_resources", deadlock.locked_resources) \
.with_property("duration_seconds", deadlock.duration.total_seconds()) \
.record()
# Try resolution strategies
strategies = [
self._timeout_oldest_transaction,
self._rollback_transactions,
self._restart_modules,
self._escalate_to_operator
]
for strategy in strategies:
try:
resolved = await strategy(deadlock)
if resolved:
self.logger.info(f"Deadlock resolved using {strategy.__name__}")
await self.messages.publish("watchdog.deadlock.resolved", {
"deadlock_id": deadlock.id,
"strategy": strategy.__name__,
"timestamp": datetime.utcnow().isoformat()
})
return
except Exception as e:
self.logger.error(f"Strategy {strategy.__name__} failed: {e}")
# Emergency response if all strategies fail
await self._emergency_shutdown(deadlock)
async def _handle_resource_exceeded(self, alert: ResourceAlert):
"""Handle resource threshold violations"""
self.logger.warning(
f"Resource threshold exceeded: {alert.resource_type} "
f"at {alert.current_value}% for {alert.module_id}"
)
# Apply resource capping based on type
if alert.resource_type == "cpu":
await self._apply_cpu_throttling(alert)
elif alert.resource_type == "memory":
await self._apply_memory_pressure(alert)
elif alert.resource_type == "io":
await self._apply_io_rate_limiting(alert)
# Record metrics
self.metrics.gauge(
f"watchdog.resource.{alert.resource_type}_usage",
alert.current_value,
labels={"module": alert.module_id}
)
async def _apply_cpu_throttling(self, alert: ResourceAlert):
"""Apply CPU throttling to runaway process"""
# Calculate throttle percentage
throttle_percent = min(
(alert.current_value - alert.threshold) * 2,
50 # Max 50% throttling
)
# Apply throttling
await self.messages.publish("module.throttle", {
"module_id": alert.module_id,
"resource": "cpu",
"throttle_percent": throttle_percent,
"duration": 60 # seconds
})
# Set up progressive throttling if needed
self._resource_limits[alert.module_id] = {
"cpu_limit": 100 - throttle_percent,
"applied_at": datetime.utcnow(),
"expires_at": datetime.utcnow() + timedelta(minutes=5)
}
async def _handle_performance_anomaly(self, anomaly):
"""Handle detected performance anomalies"""
self.logger.warning(f"Performance anomaly: {anomaly}")
# Collect diagnostic data
diagnostics = await self._collect_diagnostics(anomaly)
# Determine intervention level
intervention_level = self._determine_intervention_level(anomaly)
# Apply intervention
interventions = {
"monitor": self._record_anomaly,
"adjust": self._apply_performance_tuning,
"degrade": self._enable_degraded_mode,
"restart": self._restart_components
}
intervention_func = interventions.get(intervention_level)
if intervention_func:
await intervention_func(anomaly, diagnostics)
async def _handle_cascade_failure(self, cascade_info):
"""Prevent cascade failures from spreading"""
self.logger.critical(f"Cascade failure detected: {cascade_info}")
# Immediately isolate affected modules
        # Immediately isolate affected modules
        for module_id in cascade_info["at_risk_modules"]:
            await self._isolate_module(module_id)
        # Apply circuit breakers
        for connection in cascade_info["critical_connections"]:
            await self._apply_circuit_breaker(connection)
        # Notify system
        await self.messages.publish("watchdog.cascade.detected", {
            "failed_modules": cascade_info["failed_modules"],
            "at_risk_modules": cascade_info["at_risk_modules"],
            "isolation_applied": True,
            "timestamp": datetime.utcnow().isoformat()
        })
class DeadlockWatchdog:
"""Detects and reports deadlocks in the system"""
def __init__(self, detection_interval, max_wait_time, on_deadlock_detected):
self.detection_interval = detection_interval
self.max_wait_time = max_wait_time
self.on_deadlock_detected = on_deadlock_detected
self.lock_graph = defaultdict(list)
self.wait_times = {}
async def check_health(self):
"""Check for deadlocks in the system"""
# Build wait-for graph
wait_graph = await self._build_wait_graph()
# Detect cycles using DFS
cycles = self._detect_cycles(wait_graph)
# Check if any cycles exceed max wait time
deadlocks = []
for cycle in cycles:
wait_time = self._calculate_cycle_wait_time(cycle)
if wait_time > self.max_wait_time:
deadlock = DeadlockInfo(
id=f"deadlock_{datetime.utcnow().timestamp()}",
involved_modules=cycle['modules'],
locked_resources=cycle['resources'],
duration=timedelta(seconds=wait_time),
wait_graph=cycle['graph']
)
deadlocks.append(deadlock)
if self.on_deadlock_detected:
await self.on_deadlock_detected(deadlock)
return WatchdogResult(
healthy=len(deadlocks) == 0,
needs_intervention=len(deadlocks) > 0,
details={"deadlock_count": len(deadlocks)}
)
def _detect_cycles(self, graph):
"""Detect cycles in wait-for graph using DFS"""
visited = set()
rec_stack = set()
cycles = []
        def dfs(node, path):
            visited.add(node)
            rec_stack.add(node)
            path.append(node)
            for neighbor in graph.get(node, []):
                if neighbor not in visited:
                    dfs(neighbor, path.copy())
                elif neighbor in rec_stack:
                    # Found a cycle: slice the current path from the first
                    # occurrence of the revisited node
                    cycle_start = path.index(neighbor)
                    cycle = path[cycle_start:] + [neighbor]
                    cycles.append({
                        'modules': cycle,
                        'resources': self._get_cycle_resources(cycle),
                        'graph': {n: graph[n] for n in cycle}
                    })
            rec_stack.remove(node)
for node in graph:
if node not in visited:
dfs(node, [])
return cycles
class PerformanceWatchdog:
"""Monitors performance metrics and detects anomalies"""
def __init__(self, baseline_window, deviation_threshold, on_anomaly_detected):
self.baseline_window = baseline_window
self.deviation_threshold = deviation_threshold
self.on_anomaly_detected = on_anomaly_detected
self.baselines = {}
self.metric_history = defaultdict(list)
async def check_health(self):
"""Check for performance anomalies"""
current_metrics = await self._collect_current_metrics()
anomalies = []
for metric_name, value in current_metrics.items():
# Update history
self.metric_history[metric_name].append({
'value': value,
'timestamp': datetime.utcnow()
})
# Clean old data
cutoff = datetime.utcnow() - timedelta(seconds=self.baseline_window)
self.metric_history[metric_name] = [
m for m in self.metric_history[metric_name]
if m['timestamp'] > cutoff
]
# Calculate baseline
if len(self.metric_history[metric_name]) > 10:
values = [m['value'] for m in self.metric_history[metric_name]]
mean = statistics.mean(values)
stdev = statistics.stdev(values) if len(values) > 1 else 0
# Check for anomaly
if stdev > 0:
z_score = abs((value - mean) / stdev)
if z_score > self.deviation_threshold:
anomaly = {
'metric': metric_name,
'value': value,
'expected': mean,
'z_score': z_score,
'type': self._classify_anomaly(metric_name, value, mean)
}
anomalies.append(anomaly)
if self.on_anomaly_detected:
await self.on_anomaly_detected(anomaly)
return WatchdogResult(
healthy=len(anomalies) == 0,
needs_intervention=len(anomalies) > 0,
details={'anomaly_count': len(anomalies)}
)
class CascadeFailureWatchdog:
"""Detects and prevents cascade failures"""
def __init__(self, failure_threshold, time_window, on_cascade_detected):
self.failure_threshold = failure_threshold
self.time_window = time_window
self.on_cascade_detected = on_cascade_detected
self.failure_history = defaultdict(list)
self.dependency_graph = {}
async def check_health(self):
"""Check for cascade failure patterns"""
# Clean old failure records
cutoff = datetime.utcnow() - timedelta(seconds=self.time_window)
for module_id in list(self.failure_history.keys()):
self.failure_history[module_id] = [
f for f in self.failure_history[module_id]
if f['timestamp'] > cutoff
]
# Detect cascade patterns
cascade_risks = []
for module_id, failures in self.failure_history.items():
if len(failures) >= self.failure_threshold:
# Check downstream dependencies
at_risk = self._identify_at_risk_modules(module_id)
if at_risk:
cascade_info = {
'failed_modules': [module_id],
'at_risk_modules': at_risk,
'failure_count': len(failures),
'critical_connections': self._get_critical_connections(
module_id, at_risk
)
}
cascade_risks.append(cascade_info)
if self.on_cascade_detected:
await self.on_cascade_detected(cascade_info)
return WatchdogResult(
healthy=len(cascade_risks) == 0,
needs_intervention=len(cascade_risks) > 0,
details={'cascade_risk_count': len(cascade_risks)}
)
#include <nexus/module.hpp>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <deque>
#include <functional>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>
class AdvancedWatchdogModule : public nexus::ModuleBase {
private:
struct WatchdogConfig {
std::chrono::seconds check_interval{5};
std::chrono::seconds timeout{30};
double cpu_threshold{90.0};
double memory_threshold{85.0};
size_t deadlock_check_interval{30};
};
struct DeadlockInfo {
std::string id;
std::vector<std::string> involved_modules;
std::vector<std::string> locked_resources;
std::chrono::duration<double> duration;
std::unordered_map<std::string, std::vector<std::string>> wait_graph;
};
WatchdogConfig config_;
std::unordered_map<std::string, std::unique_ptr<IWatchdog>> watchdogs_;
std::atomic<bool> running_{false};
std::vector<std::thread> watchdog_threads_;
public:
AdvancedWatchdogModule() : ModuleBase("advanced-watchdog") {
configure_watchdogs();
}
async_task<void> on_initialize() override {
// Start all watchdog threads
running_ = true;
// Process health watchdog
watchdog_threads_.emplace_back([this] {
run_process_watchdog();
});
// Resource watchdog
watchdog_threads_.emplace_back([this] {
run_resource_watchdog();
});
// Deadlock detection watchdog
watchdog_threads_.emplace_back([this] {
run_deadlock_watchdog();
});
// Performance anomaly watchdog
watchdog_threads_.emplace_back([this] {
run_performance_watchdog();
});
// Register health checks
health()->add_check("watchdog-system",
[this]() -> async_task<HealthCheckResult> {
co_return check_watchdog_health();
});
co_return;
}
async_task<void> on_shutdown() override {
running_ = false;
// Stop all watchdog threads
for (auto& thread : watchdog_threads_) {
if (thread.joinable()) {
thread.join();
}
}
co_return;
}
private:
void configure_watchdogs() {
// Configure individual watchdogs
watchdogs_["process"] = std::make_unique<ProcessHealthWatchdog>(
config_.check_interval,
config_.timeout
);
watchdogs_["resource"] = std::make_unique<ResourceWatchdog>(
config_.cpu_threshold,
config_.memory_threshold
);
watchdogs_["deadlock"] = std::make_unique<DeadlockWatchdog>(
config_.deadlock_check_interval
);
watchdogs_["performance"] = std::make_unique<PerformanceWatchdog>(
std::chrono::hours(1), // baseline window
2.0 // deviation threshold
);
}
void run_deadlock_watchdog() {
auto last_check = std::chrono::steady_clock::now();
while (running_) {
auto now = std::chrono::steady_clock::now();
if (now - last_check >= std::chrono::seconds(config_.deadlock_check_interval)) {
detect_deadlocks();
last_check = now;
}
std::this_thread::sleep_for(std::chrono::seconds(1));
}
}
void detect_deadlocks() {
// Build wait-for graph
auto wait_graph = build_wait_graph();
// Detect cycles using DFS
std::unordered_set<std::string> visited;
std::unordered_set<std::string> rec_stack;
std::vector<DeadlockInfo> deadlocks;
for (const auto& [node, _] : wait_graph) {
if (visited.find(node) == visited.end()) {
std::vector<std::string> path;
detect_cycles_dfs(node, wait_graph, visited, rec_stack, path, deadlocks);
}
}
// Handle detected deadlocks
for (const auto& deadlock : deadlocks) {
handle_deadlock(deadlock);
}
}
void detect_cycles_dfs(
const std::string& node,
const std::unordered_map<std::string, std::vector<std::string>>& graph,
std::unordered_set<std::string>& visited,
std::unordered_set<std::string>& rec_stack,
std::vector<std::string>& path,
std::vector<DeadlockInfo>& deadlocks) {
visited.insert(node);
rec_stack.insert(node);
path.push_back(node);
auto it = graph.find(node);
if (it != graph.end()) {
for (const auto& neighbor : it->second) {
if (visited.find(neighbor) == visited.end()) {
detect_cycles_dfs(neighbor, graph, visited, rec_stack, path, deadlocks);
} else if (rec_stack.find(neighbor) != rec_stack.end()) {
// Found a cycle
auto cycle_start = std::find(path.begin(), path.end(), neighbor);
if (cycle_start != path.end()) {
DeadlockInfo deadlock;
deadlock.id = generate_uuid();
deadlock.involved_modules.assign(cycle_start, path.end());
deadlock.involved_modules.push_back(neighbor);
deadlock.duration = calculate_wait_duration(deadlock.involved_modules);
// Only report if wait time exceeds threshold
if (deadlock.duration > std::chrono::minutes(2)) {
deadlocks.push_back(deadlock);
}
}
}
}
}
rec_stack.erase(node);
path.pop_back();
}
async_task<void> handle_deadlock(const DeadlockInfo& deadlock) {
logger()->critical("Deadlock detected: {}", deadlock.id);
// Log to audit system
co_await audit()->security_event("DeadlockDetected")
.with_property("involved_modules", deadlock.involved_modules)
.with_property("duration_ms",
std::chrono::duration_cast<std::chrono::milliseconds>(
deadlock.duration).count())
.record_async();
// Try resolution strategies
std::vector<std::function<async_task<bool>(const DeadlockInfo&)>> strategies = {
[this](const auto& dl) { return timeout_oldest_transaction(dl); },
[this](const auto& dl) { return rollback_transactions(dl); },
[this](const auto& dl) { return restart_modules(dl); }
};
for (const auto& strategy : strategies) {
try {
bool resolved = co_await strategy(deadlock);
if (resolved) {
logger()->info("Deadlock {} resolved", deadlock.id);
co_await messages()->publish("watchdog.deadlock.resolved", {
{"deadlock_id", deadlock.id},
{"timestamp", std::chrono::system_clock::now()}
});
co_return;
}
} catch (const std::exception& e) {
logger()->error("Deadlock resolution strategy failed: {}", e.what());
}
}
// Emergency shutdown if all strategies fail
co_await emergency_shutdown(deadlock);
}
void run_resource_watchdog() {
while (running_) {
auto resources = collect_resource_usage();
for (const auto& [module_id, usage] : resources) {
// Check CPU usage
if (usage.cpu_percent > config_.cpu_threshold) {
handle_resource_exceeded(ResourceAlert{
ResourceType::CPU,
module_id,
usage.cpu_percent,
config_.cpu_threshold
});
}
// Check memory usage
if (usage.memory_percent > config_.memory_threshold) {
handle_resource_exceeded(ResourceAlert{
ResourceType::Memory,
module_id,
usage.memory_percent,
config_.memory_threshold
});
}
}
std::this_thread::sleep_for(std::chrono::seconds(10));
}
}
async_task<void> handle_resource_exceeded(const ResourceAlert& alert) {
logger()->warn("Resource threshold exceeded: {} at {}% for {}",
to_string(alert.resource_type),
alert.current_value,
alert.module_id);
// Apply resource capping
switch (alert.resource_type) {
case ResourceType::CPU:
co_await apply_cpu_throttling(alert);
break;
case ResourceType::Memory:
co_await apply_memory_pressure(alert);
break;
case ResourceType::IO:
co_await apply_io_rate_limiting(alert);
break;
}
// Update metrics
metrics()->gauge(
fmt::format("watchdog.resource.{}_usage", to_string(alert.resource_type)),
alert.current_value,
{{"module", alert.module_id}}
);
}
void run_performance_watchdog() {
PerformanceBaseline baseline;
std::deque<MetricSnapshot> history;
const size_t history_size = 3600; // 1 hour of seconds
while (running_) {
auto snapshot = collect_performance_metrics();
history.push_back(snapshot);
// Maintain sliding window
if (history.size() > history_size) {
history.pop_front();
}
// Update baseline
if (history.size() >= 60) { // Need at least 1 minute of data
baseline = calculate_baseline(history);
// Detect anomalies
auto anomalies = detect_anomalies(snapshot, baseline);
for (const auto& anomaly : anomalies) {
handle_performance_anomaly(anomaly);
}
}
std::this_thread::sleep_for(std::chrono::seconds(1));
}
}
std::vector<PerformanceAnomaly> detect_anomalies(
const MetricSnapshot& current,
const PerformanceBaseline& baseline) {
std::vector<PerformanceAnomaly> anomalies;
// Check latency
double latency_z_score = calculate_z_score(
current.avg_latency_ms,
baseline.latency_mean,
baseline.latency_stddev
);
if (std::abs(latency_z_score) > 2.0) {
anomalies.push_back(PerformanceAnomaly{
AnomalyType::LatencySpike,
"latency",
current.avg_latency_ms,
baseline.latency_mean,
latency_z_score
});
}
// Check throughput
double throughput_z_score = calculate_z_score(
current.throughput,
baseline.throughput_mean,
baseline.throughput_stddev
);
if (throughput_z_score < -2.0) { // Significant drop
anomalies.push_back(PerformanceAnomaly{
AnomalyType::ThroughputDrop,
"throughput",
current.throughput,
baseline.throughput_mean,
throughput_z_score
});
}
return anomalies;
}
};
// Specialized watchdog for hung process detection
class ProcessHealthWatchdog : public IWatchdog {
private:
struct ProcessState {
std::chrono::steady_clock::time_point last_heartbeat;
std::atomic<bool> responsive{true};
size_t missed_heartbeats{0};
};
std::unordered_map<std::string, ProcessState> process_states_;
std::mutex state_mutex_;
std::chrono::seconds timeout_;
public:
ProcessHealthWatchdog(std::chrono::seconds check_interval,
std::chrono::seconds timeout)
: timeout_(timeout) {}
    async_task<HealthCheckResult> check_health() override {
        auto now = std::chrono::steady_clock::now();
        std::vector<std::string> unresponsive_modules;
        std::vector<std::string> needs_recovery;
        {
            // Hold the mutex only while scanning state; never across a co_await
            std::lock_guard<std::mutex> lock(state_mutex_);
            for (auto& [module_id, state] : process_states_) {
                auto elapsed = now - state.last_heartbeat;
                if (elapsed > timeout_) {
                    state.responsive = false;
                    state.missed_heartbeats++;
                    unresponsive_modules.push_back(module_id);
                    // Queue recovery after 3 missed heartbeats
                    if (state.missed_heartbeats >= 3) {
                        needs_recovery.push_back(module_id);
                    }
                } else {
                    state.responsive = true;
                    state.missed_heartbeats = 0;
                }
            }
        }
        for (const auto& module_id : needs_recovery) {
            co_await trigger_recovery(module_id);
        }
if (unresponsive_modules.empty()) {
co_return HealthCheckResult::healthy();
} else {
co_return HealthCheckResult::unhealthy(
fmt::format("{} modules unresponsive", unresponsive_modules.size())
);
}
}
void record_heartbeat(const std::string& module_id) {
std::lock_guard<std::mutex> lock(state_mutex_);
process_states_[module_id].last_heartbeat = std::chrono::steady_clock::now();
}
};
Heartbeat and Liveness Patterns
Implementing Reliable Heartbeats
Design heartbeat mechanisms that can distinguish between temporary glitches and real failures.
public class HeartbeatModule : ModuleBase
{
private readonly TimeSpan _heartbeatInterval = TimeSpan.FromSeconds(5);
private readonly TimeSpan _heartbeatTimeout = TimeSpan.FromSeconds(30);
    private readonly Dictionary<string, HeartbeatTracker> _trackers = new();
    private Timer _heartbeatTimer;
    private long _sequenceNumber;
protected override async Task OnInitializeAsync()
{
// Start heartbeat sender
_heartbeatTimer = new Timer(
async _ => await SendHeartbeatAsync(),
null,
TimeSpan.Zero,
_heartbeatInterval
);
// Subscribe to heartbeats from other modules
await Messages.SubscribeAsync("module.heartbeat", HandleHeartbeat);
// Register liveness probe
Health.AddCheck("heartbeat", async () =>
{
var failures = _trackers.Values
.Count(t => !t.IsAlive(_heartbeatTimeout));
if (failures == 0)
return HealthCheckResult.Healthy();
if (failures > _trackers.Count / 2)
return HealthCheckResult.Unhealthy($"{failures} modules not responding");
return HealthCheckResult.Degraded($"{failures} modules degraded");
});
}
private async Task SendHeartbeatAsync()
{
try
{
var heartbeat = new ModuleHeartbeat
{
ModuleId = ModuleId,
Timestamp = DateTime.UtcNow,
SequenceNumber = Interlocked.Increment(ref _sequenceNumber),
// Include diagnostic data
Diagnostics = new HeartbeatDiagnostics
{
CpuUsage = GetCurrentCpuUsage(),
MemoryUsage = GetCurrentMemoryUsage(),
ActiveTasks = GetActiveTaskCount(),
QueueDepth = GetMessageQueueDepth(),
LastError = GetLastError(),
Uptime = GetUptime()
}
};
await Messages.PublishAsync("module.heartbeat", heartbeat);
// Also send to watchdog directly for faster detection
await Messages.PublishAsync($"watchdog.heartbeat.{ModuleId}", heartbeat);
}
catch (Exception ex)
{
Logger.Error("Failed to send heartbeat", ex);
// Don't throw - heartbeat failure shouldn't crash the module
}
}
private class HeartbeatTracker
{
private readonly object _lock = new object();
private DateTime _lastHeartbeat;
private long _lastSequence;
    private readonly Queue<DateTime> _heartbeatHistory = new();
public bool IsAlive(TimeSpan timeout)
{
lock (_lock)
{
return DateTime.UtcNow - _lastHeartbeat < timeout;
}
}
public void RecordHeartbeat(ModuleHeartbeat heartbeat)
{
lock (_lock)
{
// Check for sequence gaps (indicates lost heartbeats)
if (heartbeat.SequenceNumber != _lastSequence + 1 && _lastSequence != 0)
{
var gap = heartbeat.SequenceNumber - _lastSequence - 1;
Logger.Warning($"Detected {gap} missed heartbeats from {heartbeat.ModuleId}");
}
_lastHeartbeat = heartbeat.Timestamp;
_lastSequence = heartbeat.SequenceNumber;
// Maintain history for pattern analysis
_heartbeatHistory.Enqueue(heartbeat.Timestamp);
while (_heartbeatHistory.Count > 100)
{
_heartbeatHistory.Dequeue();
}
// Analyze heartbeat patterns
AnalyzeHeartbeatPattern();
}
}
private void AnalyzeHeartbeatPattern()
{
if (_heartbeatHistory.Count < 10) return;
    var intervals = new List<double>();
var timestamps = _heartbeatHistory.ToArray();
for (int i = 1; i < timestamps.Length; i++)
{
intervals.Add((timestamps[i] - timestamps[i-1]).TotalSeconds);
}
var avgInterval = intervals.Average();
var stdDev = Math.Sqrt(intervals.Select(x => Math.Pow(x - avgInterval, 2)).Average());
// Detect irregular heartbeats
if (stdDev > avgInterval * 0.5)
{
Logger.Warning($"Irregular heartbeat pattern detected: avg={avgInterval:F2}s, stddev={stdDev:F2}s");
}
}
}
}
Cascade Failure Prevention
Detecting and Stopping Failure Propagation
Implement patterns to detect when failures are spreading through the system and take preventive action.
public class CascadePreventionModule : ModuleBase
{
    private FailurePropagationDetector _detector;
    private IsolationController _isolationController;
    private Dictionary<string, List<string>> _dependencyGraph;
protected override async Task OnInitializeAsync()
{
// Build dependency graph
_dependencyGraph = await BuildDependencyGraphAsync();
// Configure failure detection
_detector = new FailurePropagationDetector
{
FailureThreshold = 3,
TimeWindow = TimeSpan.FromMinutes(1),
PropagationSpeed = 0.5 // failures per second
};
// Subscribe to failure events
await Messages.SubscribeAsync("module.failed", HandleModuleFailure);
await Messages.SubscribeAsync("module.error", HandleModuleError);
}
private async Task HandleModuleFailure(Message message)
{
    var failure = message.GetPayload<ModuleFailure>(); // Payload type name assumed
// Record failure
_detector.RecordFailure(failure.ModuleId, failure.Timestamp);
// Check for cascade pattern
var cascadeRisk = _detector.AnalyzeCascadeRisk(_dependencyGraph);
if (cascadeRisk.IsCascading)
{
Logger.Critical($"Cascade failure detected! Risk level: {cascadeRisk.RiskLevel}");
// Take preventive action based on risk level
switch (cascadeRisk.RiskLevel)
{
case CascadeRiskLevel.Low:
await ApplyRateLimiting(cascadeRisk.AtRiskModules);
break;
case CascadeRiskLevel.Medium:
await IsolateFailedModules(cascadeRisk.FailedModules);
break;
case CascadeRiskLevel.High:
await ActivateEmergencyMode(cascadeRisk);
break;
case CascadeRiskLevel.Critical:
await InitiateControlledShutdown(cascadeRisk);
break;
}
}
}
    private async Task IsolateFailedModules(List<string> moduleIds)
{
foreach (var moduleId in moduleIds)
{
// Apply circuit breaker to all connections
var connections = GetModuleConnections(moduleId);
foreach (var connection in connections)
{
await _isolationController.IsolateConnection(connection);
}
// Redirect traffic to healthy instances
await RedirectTraffic(moduleId);
// Notify system
await Messages.PublishAsync("cascade.module.isolated", new
{
ModuleId = moduleId,
Reason = "Cascade prevention",
IsolatedAt = DateTime.UtcNow
});
}
}
private async Task ActivateEmergencyMode(CascadeRisk risk)
{
Logger.Critical("Activating emergency mode to prevent system-wide failure");
// 1. Shed non-critical load
await ShedNonCriticalLoad();
// 2. Activate all circuit breakers
await ActivateAllCircuitBreakers();
// 3. Switch to degraded mode
await EnableSystemWideDegradedMode();
// 4. Alert operators
await AlertOperators(new EmergencyAlert
{
Type = "CascadeFailure",
Severity = "Critical",
AffectedModules = risk.FailedModules.Count,
AtRiskModules = risk.AtRiskModules.Count,
RecommendedActions = GenerateRecommendedActions(risk)
});
}
}
public class FailurePropagationDetector
{
    private readonly Dictionary<string, List<FailureRecord>> _failureHistory = new();
    private readonly object _lock = new object();
    // Tunables referenced below; set by the host module during configuration
    public int FailureThreshold { get; set; }
    public TimeSpan TimeWindow { get; set; }
    public double PropagationSpeed { get; set; }
    public CascadeRisk AnalyzeCascadeRisk(Dictionary<string, List<string>> dependencyGraph)
{
lock (_lock)
{
// Clean old records
CleanOldRecords();
// Identify failed modules
var failedModules = _failureHistory
.Where(kvp => kvp.Value.Count >= FailureThreshold)
.Select(kvp => kvp.Key)
.ToList();
if (failedModules.Count == 0)
return CascadeRisk.None;
// Calculate failure velocity
var failureVelocity = CalculateFailureVelocity();
// Identify at-risk modules using dependency graph
            var atRiskModules = new HashSet<string>();
foreach (var failed in failedModules)
{
var dependents = GetDependentModules(failed, dependencyGraph);
foreach (var dependent in dependents)
{
if (!failedModules.Contains(dependent))
{
atRiskModules.Add(dependent);
}
}
}
// Determine risk level
var riskLevel = DetermineRiskLevel(
failedModules.Count,
atRiskModules.Count,
failureVelocity
);
return new CascadeRisk
{
IsCascading = failureVelocity > PropagationSpeed,
RiskLevel = riskLevel,
FailedModules = failedModules,
AtRiskModules = atRiskModules.ToList(),
FailureVelocity = failureVelocity,
PredictedImpact = PredictImpact(failedModules, atRiskModules, dependencyGraph)
};
}
}
private double CalculateFailureVelocity()
{
var recentFailures = _failureHistory
.SelectMany(kvp => kvp.Value)
.Where(f => f.Timestamp > DateTime.UtcNow - TimeWindow)
.OrderBy(f => f.Timestamp)
.ToList();
if (recentFailures.Count < 2)
return 0;
var timeSpan = (recentFailures.Last().Timestamp - recentFailures.First().Timestamp).TotalSeconds;
return timeSpan > 0 ? recentFailures.Count / timeSpan : 0;
}
}
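As a worked example of the velocity check: with the detector configured earlier (PropagationSpeed = 0.5 failures per second), six failures spread over a 12-second window give a velocity of 6 / 12 = 0.5, which sits exactly at the threshold and does not yet count as cascading; a seventh failure inside the same window pushes the velocity above 0.5 and flips IsCascading to true.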
Intelligent Recovery Strategies
Context-Aware Recovery
Implement recovery strategies that adapt based on failure patterns and system state.
public class IntelligentRecoveryModule : ModuleBase
{
    private RecoveryOrchestrator _orchestrator;
    private FailureAnalyzer _analyzer;
    private readonly Dictionary<string, RecoveryHistory> _recoveryHistory = new();
protected override async Task OnInitializeAsync()
{
_orchestrator = new RecoveryOrchestrator();
_analyzer = new FailureAnalyzer();
// Register recovery strategies
RegisterRecoveryStrategies();
// Subscribe to watchdog alerts
await Messages.SubscribeAsync("watchdog.alert.*", HandleWatchdogAlert);
}
private void RegisterRecoveryStrategies()
{
// Level 1: Gentle recovery
_orchestrator.RegisterStrategy(RecoveryLevel.Gentle, new IRecoveryStrategy[]
{
new ClearCacheStrategy(),
new ResetConnectionsStrategy(),
new GarbageCollectionStrategy()
});
// Level 2: Moderate recovery
_orchestrator.RegisterStrategy(RecoveryLevel.Moderate, new IRecoveryStrategy[]
{
new RestartThreadPoolStrategy(),
new ReloadConfigurationStrategy(),
new ReinitializeServicesStrategy()
});
// Level 3: Aggressive recovery
_orchestrator.RegisterStrategy(RecoveryLevel.Aggressive, new IRecoveryStrategy[]
{
new ProcessRestartStrategy(),
new StateResetStrategy(),
new DependencyRestartStrategy()
});
// Level 4: Emergency recovery
_orchestrator.RegisterStrategy(RecoveryLevel.Emergency, new IRecoveryStrategy[]
{
new IsolateAndRestartStrategy(),
new FailoverToBackupStrategy(),
new DisasterRecoveryStrategy()
});
}
private async Task HandleWatchdogAlert(Message message)
{
    var alert = message.GetPayload<WatchdogAlert>(); // Payload type name assumed
// Analyze failure pattern
var analysis = _analyzer.AnalyzeFailure(alert, _recoveryHistory);
// Determine recovery strategy based on analysis
var strategy = DetermineRecoveryStrategy(analysis);
// Execute recovery
var result = await _orchestrator.ExecuteRecoveryAsync(
alert.ModuleId,
strategy,
new RecoveryContext
{
FailureType = analysis.FailureType,
FailureCount = analysis.FailureCount,
TimeSinceLastRecovery = analysis.TimeSinceLastRecovery,
SystemLoad = await GetSystemLoadAsync(),
Dependencies = await GetModuleDependenciesAsync(alert.ModuleId)
}
);
// Record recovery attempt
RecordRecoveryAttempt(alert.ModuleId, strategy, result);
// Adjust future strategies based on success
if (!result.Success)
{
await EscalateRecoveryStrategy(alert.ModuleId, strategy);
}
}
private RecoveryStrategy DetermineRecoveryStrategy(FailureAnalysis analysis)
{
// Learn from past recoveries
var history = _recoveryHistory.GetValueOrDefault(analysis.ModuleId);
if (history != null)
{
var successfulStrategies = history.GetSuccessfulStrategies(analysis.FailureType);
if (successfulStrategies.Any())
{
return successfulStrategies.First();
}
}
// Use heuristics based on failure type
return analysis.FailureType switch
{
FailureType.Timeout => RecoveryStrategy.RestartWithBackoff,
FailureType.OutOfMemory => RecoveryStrategy.MemoryCleanupAndRestart,
FailureType.Deadlock => RecoveryStrategy.ForceUnlockAndReset,
FailureType.ResourceExhaustion => RecoveryStrategy.ResourceReallocation,
FailureType.PerformanceDegradation => RecoveryStrategy.PerformanceTuning,
_ => RecoveryStrategy.StandardRestart
};
}
}
// Adaptive recovery orchestrator
public class RecoveryOrchestrator
{
    private readonly Dictionary<RecoveryLevel, IRecoveryStrategy[]> _strategies = new();
    private readonly IBackoffPolicy _backoffPolicy;
    public async Task<RecoveryResult> ExecuteRecoveryAsync(
string moduleId,
RecoveryStrategy strategy,
RecoveryContext context)
{
Logger.Info($"Starting recovery for {moduleId} using {strategy}");
// Determine recovery level based on context
var level = DetermineRecoveryLevel(context);
// Get strategies for this level
var strategies = _strategies[level];
// Execute strategies with backoff
var attempt = 0;
var backoff = TimeSpan.Zero;
foreach (var recoveryStrategy in strategies)
{
attempt++;
// Wait for backoff period
if (backoff > TimeSpan.Zero)
{
await Task.Delay(backoff);
}
try
{
// Check preconditions
if (!await recoveryStrategy.CanExecuteAsync(context))
{
continue;
}
// Execute recovery
var result = await recoveryStrategy.ExecuteAsync(moduleId, context);
if (result.Success)
{
Logger.Info($"Recovery successful using {recoveryStrategy.Name}");
return result;
}
// Calculate next backoff
backoff = _backoffPolicy.GetNextDelay(attempt);
}
catch (Exception ex)
{
Logger.Error($"Recovery strategy {recoveryStrategy.Name} failed", ex);
}
}
return RecoveryResult.Failed("All recovery strategies exhausted");
}
}
Real-World Watchdog Examples
Example 1: Manufacturing Process Watchdog
A watchdog system for critical manufacturing processes with safety interlocks.
public class ManufacturingWatchdogModule : ModuleBase
{
    private SafetyInterlockSystem _safetySystem;
    private ProcessMonitor _processMonitor;
    private EmergencyStopController _emergencyStop;
protected override async Task OnInitializeAsync()
{
// Initialize safety systems
_safetySystem = new SafetyInterlockSystem();
_emergencyStop = new EmergencyStopController();
// Configure process-specific watchdogs
ConfigureProcessWatchdogs();
// Start monitoring
_ = Task.Run(MonitorManufacturingProcess);
}
private void ConfigureProcessWatchdogs()
{
// Temperature watchdog for furnace
var tempWatchdog = new TemperatureWatchdog
{
CriticalHigh = 1200, // Celsius
WarningHigh = 1150,
CriticalLow = 800,
WarningLow = 850,
CheckInterval = TimeSpan.FromSeconds(1)
};
tempWatchdog.OnCriticalTemperature += HandleCriticalTemperature;
// Pressure watchdog for hydraulic systems
var pressureWatchdog = new PressureWatchdog
{
MaxPressure = 3000, // PSI
MinPressure = 1000,
RateOfChangeLimit = 100, // PSI per second
CheckInterval = TimeSpan.FromMilliseconds(100)
};
pressureWatchdog.OnPressureAnomaly += HandlePressureAnomaly;
// Motion watchdog for robotic arms
var motionWatchdog = new MotionWatchdog
{
MaxVelocity = 2.0, // meters per second
MaxAcceleration = 5.0, // meters per second squared
PositionTolerance = 0.001, // meters
SafetyBoundary = DefineRobotSafetyBoundary()
};
motionWatchdog.OnSafetyViolation += HandleSafetyViolation;
}
private async Task HandleCriticalTemperature(TemperatureAlert alert)
{
Logger.Critical($"Critical temperature alert: {alert.Temperature}°C");
// Immediate safety response
if (alert.Temperature > alert.CriticalHigh)
{
// Emergency cooling
await _safetySystem.ActivateEmergencyCooling();
// Reduce power
await _processMonitor.ReducePower(50); // 50% reduction
// Alert operators
await AlertOperators("CRITICAL: Furnace overtemperature", alert);
}
// Log for compliance
await Audit.SafetyEvent("CriticalTemperature")
.WithProperty("temperature", alert.Temperature)
.WithProperty("location", alert.SensorLocation)
.WithProperty("action", "EmergencyCooling")
.ForCompliance(ComplianceStandard.ISO_9001)
.RecordAsync();
}
private async Task HandleSafetyViolation(SafetyViolation violation)
{
Logger.Critical($"Safety violation: {violation.Type}");
switch (violation.Severity)
{
case SafetySeverity.Warning:
// Slow down operations
await _processMonitor.ReduceSpeed(25);
break;
case SafetySeverity.Critical:
// Stop motion in affected area
await _emergencyStop.StopZone(violation.Zone);
break;
case SafetySeverity.Emergency:
// Full emergency stop
await _emergencyStop.ActivateFullStop();
// Lock out equipment
await _safetySystem.LockOutTagOut(violation.EquipmentId);
break;
}
// Record incident
await RecordSafetyIncident(violation);
}
}
Example 2: Financial Trading Watchdog
A watchdog system for high-frequency trading with risk management.
public class TradingWatchdogModule : ModuleBase
{
    private RiskManager _riskManager;
    private LatencyMonitor _latencyMonitor;
    private OrderAnomalyDetector _anomalyDetector;
    private CircuitBreaker _tradingCircuitBreaker;
protected override async Task OnInitializeAsync()
{
// Configure trading-specific watchdogs
ConfigureTradingWatchdogs();
// Subscribe to market data and trading events
await Messages.SubscribeAsync("market.data.*", ProcessMarketData);
await Messages.SubscribeAsync("trading.order.*", MonitorTradingActivity);
}
private void ConfigureTradingWatchdogs()
{
// Latency watchdog - critical for HFT
_latencyMonitor = new LatencyMonitor
{
MaxLatency = TimeSpan.FromMilliseconds(5),
WarningLatency = TimeSpan.FromMilliseconds(3),
MeasurementPoints = new[]
{
"market_data_receive",
"order_decision",
"order_send",
"order_acknowledge"
}
};
_latencyMonitor.OnLatencySpike += HandleLatencySpike;
// Risk exposure watchdog
var riskWatchdog = new RiskExposureWatchdog
{
MaxPositionSize = 1_000_000, // USD
MaxDrawdown = 0.02, // 2%
MaxDailyLoss = 50_000, // USD
ConcentrationLimit = 0.2 // 20% in single position
};
riskWatchdog.OnRiskLimitExceeded += HandleRiskLimitExceeded;
// Order anomaly watchdog
_anomalyDetector = new OrderAnomalyDetector
{
NormalOrderRate = 100, // orders per second
MaxOrderRate = 1000,
PriceDeviationThreshold = 0.005, // 0.5%
VolumeAnomalyMultiplier = 10
};
_anomalyDetector.OnAnomalyDetected += HandleTradingAnomaly;
}
private async Task HandleLatencySpike(LatencyAlert alert)
{
if (alert.Latency > alert.MaxLatency)
{
Logger.Critical($"Critical latency: {alert.Latency.TotalMilliseconds}ms at {alert.MeasurementPoint}");
// Switch to degraded mode
await EnableDegradedTradingMode();
// Reduce order rate
await _riskManager.SetMaxOrderRate(10); // Reduce to 10 orders/sec
// Alert trading desk
await Messages.PublishAsync("trading.alert.latency", alert);
}
}
private async Task HandleRiskLimitExceeded(RiskAlert alert)
{
Logger.Error($"Risk limit exceeded: {alert.LimitType} = {alert.CurrentValue}");
switch (alert.LimitType)
{
case RiskLimitType.Position:
// Freeze new positions
await _tradingCircuitBreaker.Open("position_limit");
// Start position reduction
await _riskManager.ReducePositions(alert.ReductionTarget);
break;
case RiskLimitType.Loss:
// Stop all trading
await _tradingCircuitBreaker.OpenAll();
// Close all positions
await _riskManager.CloseAllPositions();
// Lock trading until manual review
            await LockTradingPendingReview(alert);
break;
}
// Compliance reporting
await ReportRiskBreach(alert);
}
private async Task HandleTradingAnomaly(TradingAnomaly anomaly)
{
Logger.Warning($"Trading anomaly detected: {anomaly.Type}");
// Analyze anomaly pattern
var pattern = await _anomalyDetector.AnalyzePattern(anomaly);
if (pattern.IsMalicious)
{
// Potential market manipulation or system compromise
Logger.Critical("Potential malicious trading activity detected");
// Immediate shutdown
await EmergencyTradingShutdown();
// Snapshot state for forensics
await CaptureForensicSnapshot(anomaly);
// Alert compliance and security
await AlertComplianceAndSecurity(anomaly, pattern);
}
else if (pattern.IsSystemError)
{
// System malfunction
await HandleSystemMalfunction(anomaly);
}
}
}
Best Practices
Watchdog System Guidelines
- Layer Your Watchdogs: Use multiple levels of monitoring for defense in depth
- Fast Detection: Critical watchdogs should check frequently (seconds, not minutes)
- Avoid False Positives: Use sliding windows and thresholds to prevent noise (see the sketch after this list)
- Clear Escalation: Define clear paths from detection to resolution
- Test Watchdog Failures: Ensure the system works when watchdogs themselves fail
- Resource Awareness: Watchdogs shouldn't consume excessive resources
- Audit Everything: Log all watchdog decisions for post-mortem analysis
- Human Override: Always provide manual intervention capabilities
- Recovery Testing: Regularly test all recovery strategies
- Learn and Adapt: Use ML to improve detection and recovery over time
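To make the false-positive guideline concrete, here is a minimal sketch (illustrative only, not an SDK API) of a sliding-window debouncer: an alert fires only when the threshold is breached in most of the recent samples, so a lone spike stays quiet while a sustained breach escalates.
from collections import deque

class SlidingWindowDebouncer:
    """Raise an alert only when a condition holds for most of a recent window."""
    def __init__(self, window_size=10, trigger_ratio=0.7):
        self.samples = deque(maxlen=window_size)
        self.trigger_ratio = trigger_ratio

    def observe(self, breached: bool) -> bool:
        """Record one health sample; return True when the alert should fire."""
        self.samples.append(breached)
        if len(self.samples) < self.samples.maxlen:
            return False  # Not enough history yet; stay quiet
        return sum(self.samples) / len(self.samples) >= self.trigger_ratio

# A single CPU spike does not fire; a sustained breach does
debouncer = SlidingWindowDebouncer()
for cpu in [95, 40, 42, 41, 95, 96, 97, 95, 98, 96, 99]:
    if debouncer.observe(cpu > 90):
        print("sustained CPU breach - raise watchdog alert")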
System Packages
System packages allow you to bundle related modules together as cohesive, deployable units. This enables you to ship complete solutions, simplify deployment, and manage complex systems more effectively.
Why Use System Packages?
- Complete Solutions: Ship entire systems (monitoring, control, analytics) as single units
- Simplified Deployment: Deploy complex multi-module systems with one command
- Version Management: Version and update entire systems atomically
- Vendor Integration: ISVs can provide turnkey solutions with extension points
- Lifecycle Management: System-level hooks for initialization, health checks, and cleanup
Creating a System Package
System packages are defined using a package manifest file (package.yaml) that describes the modules, their relationships, and system-level configuration.
# package.yaml - System Package Manifest
apiVersion: nexus/v1
kind: SystemPackage
metadata:
name: "monitoring-system"
version: "1.0.0"
vendor: "Your Company"
description: "Complete monitoring solution for industrial applications"
tags: ["monitoring", "alerts", "industrial"]
spec:
# System-wide defaults inherited by all modules
defaults:
resources:
memory: "256Mi"
cpu: 0.5
environment:
LOG_LEVEL: "INFO"
RETENTION_DAYS: "30"
# Module groups for logical organization
groups:
# Core monitoring infrastructure
core:
name: "core"
description: "Core monitoring modules"
required: true
modules:
- id: "metrics-collector"
type: "csharp"
source: "modules/metrics-collector"
replicas: 2
capabilities:
provides: ["metrics.collect"]
required: ["storage.write"]
- id: "alert-engine"
type: "python"
source: "modules/alert-engine"
dependencies: ["metrics-collector"]
capabilities:
provides: ["alerts.evaluate"]
# User interface modules
ui:
name: "ui"
description: "User interface components"
modules:
- id: "dashboard"
type: "csharp"
source: "modules/dashboard"
expose:
- port: 8080
protocol: "http"
public: true
# System lifecycle hooks
lifecycle:
preStart: "scripts/setup.sh"
postStart: "scripts/verify.sh"
healthCheck: "scripts/health.sh"
preStop: "scripts/backup.sh"
Building System Packages
Use the Nexus CLI to build your system package into a distributable .nxp file:
# Build a system package
nexus package build ./my-system -o my-system-v1.0.0.nxp
# Validate package structure
nexus package validate ./my-system
# Build with additional metadata
nexus package build ./my-system \
-o my-system-v1.0.0.nxp \
--tag production \
--tag certified
Deploying System Packages
System packages can be deployed alongside individual modules in your application manifest:
# nexus-manifest.yaml
apiVersion: nexus/v1
kind: Application
metadata:
name: "Smart Factory"
version: "2.0.0"
# Deploy system packages
systemPackages:
# From local file
- name: "monitoring-system"
source: "packages/monitoring-v1.0.0.nxp"
config:
alertEmail: "ops@company.com"
moduleOverrides:
"metrics-collector":
replicas: 3 # Override default replica count
# From package registry
- name: "control-system"
source: "registry://nexus-hub/control:2.0.0"
# Deploy individual modules that integrate with packages
modules:
- id: "custom-integration"
name: "Custom Integration"
language: "python"
path: "./modules/custom"
dependencies:
# Reference modules from system packages
- "monitoring-system.metrics-collector"
- "control-system.plc-interface"
Module References in System Packages
Modules within system packages are namespaced to prevent conflicts. Reference them using the format package-name.module-id:
public class CustomModule : IModule
{
public async Task InitializeAsync(IModuleContext context)
{
// Subscribe to events from a system package module
await context.MessageBus.SubscribeAsync(
"monitoring-system.metrics-collector.data",
HandleMetricsData);
// Send request to a system package module
var response = await context.MessageBus.RequestAsync(
"control-system.plc-interface.read",
new { register = "D100" });
}
}
class CustomModule:
async def initialize(self, context):
# Subscribe to events from a system package module
await context["message_bus"].subscribe(
"monitoring-system.metrics-collector.data",
self.handle_metrics_data)
# Send request to a system package module
response = await context["message_bus"].request(
"control-system.plc-interface.read",
{"register": "D100"})
Package Lifecycle Management
System packages support lifecycle hooks that execute at the system level, allowing you to perform initialization, validation, and cleanup operations:
Available Lifecycle Hooks
Hook        | When Executed                 | Common Use Cases
------------|-------------------------------|-------------------------------------------
preStart    | Before any modules start      | Database setup, configuration validation
postStart   | After all modules are running | System verification, initial data loading
healthCheck | Periodically during runtime   | System-level health validation
preStop     | Before stopping modules       | Data backup, graceful shutdown preparation
postStop    | After all modules stopped     | Cleanup, resource deallocation
preUpgrade  | Before system upgrade         | Data migration, compatibility checks
postUpgrade | After system upgrade          | Verification, cache warming
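Hooks are declared as script paths in the package manifest (see package.yaml above). As a sketch, assuming the host invokes hooks as plain executables and treats a non-zero exit code as failure (that contract, and the environment variables shown here, are assumptions rather than documented SDK behavior), a preStart hook could validate required settings before any module starts:
#!/usr/bin/env python3
# scripts/setup.py - hypothetical preStart hook; the exit code signals success or failure
import os
import sys

def main() -> int:
    # Hypothetical: required settings are passed via environment variables
    required = ["DATABASE_URL", "RETENTION_DAYS"]
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        print(f"preStart failed: missing settings {missing}", file=sys.stderr)
        return 1  # Non-zero exit aborts the system start
    print("preStart checks passed")
    return 0

if __name__ == "__main__":
    sys.exit(main())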
Module Clustering and Replication
System packages support module clustering for high availability and load distribution. Modules can be replicated across multiple instances with automatic load balancing and failover:
# package.yaml - Clustered System Package
apiVersion: nexus/v1
kind: SystemPackage
metadata:
name: "high-availability-control"
version: "2.0.0"
spec:
groups:
controllers:
name: "controllers"
modules:
- id: "primary-controller"
type: "csharp"
source: "modules/controller"
# Enable clustering with 3 replicas
replicas: 3
clustering:
mode: "active-active" # active-active | active-passive | primary-secondary
synchronization: "state" # state | none
partitioning: "hash" # hash | range | custom
- id: "data-processor"
type: "python"
source: "modules/processor"
replicas: 5
clustering:
mode: "active-active"
loadBalancing:
strategy: "round-robin" # round-robin | least-connections | weighted
healthCheck:
interval: 10s
timeout: 5s
- id: "state-manager"
type: "csharp"
source: "modules/state"
        replicas: 2 # One active instance plus a hot standby
clustering:
mode: "active-passive"
failover:
automatic: true
timeout: 30s
Clustering Modes
Active-Active
All replicas handle requests simultaneously. Best for stateless processing and high throughput.
- Load distributed across all instances
- Automatic failover on instance failure
- Suitable for read-heavy workloads
Active-Passive
One active instance with hot standbys. Ideal for stateful services requiring consistency.
- Single active instance at a time
- Automatic promotion on failure
- Zero-downtime failover
Primary-Secondary
Primary handles writes, secondaries handle reads. Perfect for read-heavy applications.
- Write operations go to primary
- Read operations distributed to secondaries
- Automatic primary election
State Synchronization
For stateful modules, Nexus provides built-in state synchronization across cluster members:
public class ClusteredController : IModule
{
private IClusterContext _cluster;
public async Task InitializeAsync(IModuleContext context)
{
// Get cluster context for distributed operations
_cluster = context.GetService<IClusterContext>();
// Register for cluster events
_cluster.OnMemberJoined += HandleMemberJoined;
_cluster.OnMemberLeft += HandleMemberLeft;
_cluster.OnLeaderElected += HandleLeaderElection;
// Distributed state operations
await _cluster.State.SetAsync("config", new SystemConfig());
var config = await _cluster.State.GetAsync<SystemConfig>("config");
// Distributed locking for critical sections
        using (var clusterLock = await _cluster.AcquireLockAsync("critical-resource")) // 'lock' is a reserved word in C#
{
// Exclusive access across cluster
await PerformCriticalOperation();
}
// Check cluster status
var members = _cluster.GetMembers();
var isLeader = _cluster.IsLeader;
Console.WriteLine($"Cluster has {members.Count} members, I am {(isLeader ? "leader" : "follower")}");
}
private async Task HandleLeaderElection(LeaderElectionEvent evt)
{
if (evt.NewLeaderId == _cluster.MemberId)
{
// This instance is now the leader
await InitializeLeaderResponsibilities();
}
}
}
class ClusteredController:
def __init__(self):
self.cluster = None
async def initialize(self, context):
# Get cluster context for distributed operations
self.cluster = context.get_service("cluster")
# Register for cluster events
self.cluster.on_member_joined(self.handle_member_joined)
self.cluster.on_member_left(self.handle_member_left)
self.cluster.on_leader_elected(self.handle_leader_election)
# Distributed state operations
await self.cluster.state.set("config", {"version": "2.0"})
config = await self.cluster.state.get("config")
# Distributed locking for critical sections
async with self.cluster.acquire_lock("critical-resource"):
# Exclusive access across cluster
await self.perform_critical_operation()
# Check cluster status
members = self.cluster.get_members()
is_leader = self.cluster.is_leader
print(f"Cluster has {len(members)} members, I am {'leader' if is_leader else 'follower'}")
async def handle_leader_election(self, event):
if event.new_leader_id == self.cluster.member_id:
# This instance is now the leader
await self.initialize_leader_responsibilities()
class ClusteredController : public nexus::ModuleBase {
private:
    std::shared_ptr<IClusterContext> cluster_;
public:
async_result initialize(nexus::IModuleContext* context) override {
// Get cluster context for distributed operations
        cluster_ = context->get_service<IClusterContext>();
// Register for cluster events
cluster_->on_member_joined([this](const MemberInfo& member) {
co_await handle_member_joined(member);
});
cluster_->on_member_left([this](const MemberInfo& member) {
co_await handle_member_left(member);
});
cluster_->on_leader_elected([this](const LeaderElectionEvent& evt) {
co_await handle_leader_election(evt);
});
// Distributed state operations
SystemConfig config{"2.0"};
co_await cluster_->state()->set("config", config);
        auto stored_config = co_await cluster_->state()->get<SystemConfig>("config");
// Distributed locking for critical sections
{
auto lock = co_await cluster_->acquire_lock("critical-resource");
// Exclusive access across cluster
co_await perform_critical_operation();
} // Lock automatically released
// Check cluster status
auto members = cluster_->get_members();
auto is_leader = cluster_->is_leader();
logger()->info("Cluster has {} members, I am {}",
members.size(), is_leader ? "leader" : "follower");
co_return;
}
private:
async_result handle_leader_election(const LeaderElectionEvent& evt) {
if (evt.new_leader_id == cluster_->member_id()) {
// This instance is now the leader
co_await initialize_leader_responsibilities();
}
co_return;
}
};
classdef ClusteredController < nexus.Module
properties (Access = private)
cluster
end
methods
function obj = ClusteredController()
obj.cluster = [];
end
function initialize(obj, context)
% Get cluster context for distributed operations
obj.cluster = context.getService('cluster');
% Register for cluster events
obj.cluster.OnMemberJoined = @obj.handleMemberJoined;
obj.cluster.OnMemberLeft = @obj.handleMemberLeft;
obj.cluster.OnLeaderElected = @obj.handleLeaderElection;
% Distributed state operations
config = struct('version', '2.0');
obj.cluster.State.set('config', config);
storedConfig = obj.cluster.State.get('config');
% Distributed locking for critical sections
lock = obj.cluster.acquireLock('critical-resource');
try
% Exclusive access across cluster
obj.performCriticalOperation();
catch ME
lock.release();
rethrow(ME);
end
lock.release();
% Check cluster status
members = obj.cluster.getMembers();
isLeader = obj.cluster.isLeader();
fprintf('Cluster has %d members, I am %s\n', ...
length(members), ...
conditional(isLeader, 'leader', 'follower'));
end
function handleLeaderElection(obj, event)
if strcmp(event.NewLeaderId, obj.cluster.MemberId)
% This instance is now the leader
obj.initializeLeaderResponsibilities();
end
end
function performCriticalOperation(obj)
% Critical operation implementation
obj.Logger.info('Performing critical operation with exclusive lock');
end
end
end
function result = conditional(condition, trueValue, falseValue)
if condition
result = trueValue;
else
result = falseValue;
end
end
// LabVIEW G-Code Representation for Clustered Controller
// ClusteredController.vi
[Initialize Method]
├─ Get Service from Context
│ ├─ Service Name: "cluster"
│ └─ Output: Cluster Context Reference
│
├─ Register Event Callbacks
│ ├─ Create Event Registration Node
│ ├─ Register "Member Joined" → HandleMemberJoined.vi
│ ├─ Register "Member Left" → HandleMemberLeft.vi
│ └─ Register "Leader Elected" → HandleLeaderElection.vi
│
├─ Distributed State Operations
│ ├─ Create Config Cluster
│ │ └─ Version: "2.0"
│ ├─ Cluster State Set
│ │ ├─ Key: "config"
│ │ └─ Value: Config Cluster
│ ├─ Cluster State Get
│ │ ├─ Key: "config"
│ │ └─ Output: Stored Config
│ └─ Display Config Version
│
├─ Distributed Locking Section
│ ├─ Acquire Lock
│ │ ├─ Resource: "critical-resource"
│ │ └─ Output: Lock Reference
│ ├─ Try-Catch Structure
│ │ ├─ Try Case:
│ │ │ └─ Perform Critical Operation SubVI
│ │ └─ Catch Case:
│ │ ├─ Release Lock
│ │ └─ Rethrow Error
│ └─ Release Lock (Normal Exit)
│
└─ Check Cluster Status
├─ Get Members → Member Array
├─ Is Leader? → Boolean
├─ Array Size → Member Count
└─ Format String
├─ Template: "Cluster has %d members, I am %s"
├─ Arg1: Member Count
└─ Arg2: Select ("leader" if True, "follower" if False)
[Handle Leader Election Event SubVI]
├─ Event Input Terminal
├─ Compare Strings
│ ├─ String 1: Event.NewLeaderId
│ └─ String 2: Cluster.MemberId
├─ Case Structure (Equal?)
│ └─ True Case:
│ └─ Initialize Leader Responsibilities SubVI
└─ Event Handled Output
[Perform Critical Operation SubVI]
├─ Log Message
│ ├─ Level: Info
│ └─ Message: "Performing critical operation with exclusive lock"
├─ Critical Operations
│ ├─ Read Shared State
│ ├─ Modify Data
│ └─ Write Back State
└─ Return Success
Load Balancing Strategies
Configure how requests are distributed across cluster members:
clustering:
loadBalancing:
strategy: "weighted"
weights:
# Assign weights based on instance capabilities
- instance: 0
weight: 100 # Primary instance gets more traffic
- instance: 1
weight: 50
- instance: 2
weight: 50
stickySession:
enabled: true
ttl: 300s
key: "client-id" # Route same clients to same instance
Module Overrides
When deploying system packages, you can override module-specific settings without modifying the package itself:
systemPackages:
- name: "monitoring-system"
source: "monitoring-v1.0.0.nxp"
# Override specific modules
moduleOverrides:
"metrics-collector":
replicas: 5 # Scale up for production
resources:
memory: "1Gi" # Increase memory allocation
cpu: 2.0
config:
collectionInterval: 5 # More frequent collection
"alert-engine":
config:
alertThresholds:
cpu: 90 # Higher threshold for production
memory: 95
"custom-notifier":
enabled: false # Disable optional module
Package Distribution
System packages can be distributed through multiple channels:
Local Files
Distribute .nxp files directly with your application
source: "packages/my-system-v1.0.0.nxp"
Package Registry
Publish to a central registry for easy sharing
source: "registry://nexus-hub/my-system:1.0.0"
HTTP/HTTPS
Download packages from web servers
source: "https://packages.company.com/my-system-v1.0.0.nxp"
Best Practices for System Packages
- Modular Design: Keep modules focused on specific responsibilities
- Clear Dependencies: Explicitly declare all inter-module dependencies
- Version Compatibility: Test packages across different Nexus versions
- Extension Points: Design packages with customization in mind
- Resource Limits: Set appropriate resource constraints for each module
- Documentation: Include comprehensive documentation with your package
- Security: Sign packages for production deployments
- Testing: Test the entire system as a unit, not just individual modules
Time-Sensitive Networking (TSN)
NEXUS-1 provides comprehensive support for Time-Sensitive Networking (TSN), enabling deterministic, real-time communication for industrial automation applications. TSN ensures guaranteed latency bounds, precise time synchronization, and traffic prioritization essential for mission-critical control systems.
Why TSN?
- Deterministic Latency: Guaranteed sub-millisecond communication for control loops
- Time Synchronization: Network-wide clock sync with microsecond precision
- Traffic Prioritization: 7 traffic classes with hardware-enforced QoS
- Mixed Traffic: Control, monitoring, and IT traffic on same network
- Standards-Based: IEEE 802.1 TSN standards ensure interoperability
Configuring TSN
Enable TSN as your message bus transport in your application manifest:
# nexus-manifest.yaml
runtime:
messaging:
transport: tsn
tsn:
networkInterface: "eth0"
domainNumber: 0
timeRole: slave # slave | master | boundary
profile: Industrial # Default | Industrial | Automotive | Power
enableHardwareTimestamping: true
enableStreamReservation: true
maxClockOffsetMicroseconds: 100
syncIntervalMs: 125
# Define traffic classes
trafficClasses:
- name: "control"
class: Critical
vlanId: 100
priority: 7
maxLatencyMicroseconds: 500
gateControlList:
- intervalNanoseconds: 500000 # 500µs window
gateStates: 0x80 # Only highest priority
- intervalNanoseconds: 500000
gateStates: 0xFF # All priorities
- name: "monitoring"
class: ExcellentEffort
vlanId: 101
priority: 5
maxLatencyMicroseconds: 2000
- name: "standard"
class: Standard
vlanId: 102
priority: 3
maxLatencyMicroseconds: 10000
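In the gateControlList above, gateStates is a bitmask over the eight IEEE 802.1Q priorities (bit 7 corresponds to priority 7), so 0x80 opens the transmission gate only for priority-7 control traffic while 0xFF opens it for all priorities. The two 500,000 ns intervals repeat as a 1 ms cycle, reserving a dedicated 500 µs transmission window for the control class in every millisecond.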
Module TSN Requirements
Specify TSN requirements for individual modules to ensure they get appropriate real-time guarantees:
modules:
- id: "motion-controller"
name: "Motion Controller"
language: "csharp"
assembly: "MotionControl.dll"
critical: true
# TSN-specific requirements
tsnRequirements:
trafficClass: Critical # Map to defined traffic class
maxLatency: "500us" # Maximum acceptable latency
timeSynchronization: true # Require time sync
streamReservation:
bandwidth: "10Mbps" # Reserved bandwidth
maxFrameSize: 1500 # Maximum Ethernet frame size
- id: "hmi-interface"
name: "HMI Interface"
language: "python"
tsnRequirements:
trafficClass: Standard
maxLatency: "50ms"
timeSynchronization: false # HMI doesn't need precise sync
Using TSN in Modules
Modules can leverage TSN capabilities through the enhanced module context:
public class RealTimeController : IModule
{
private IModuleContext _context;
private ITsnModuleContext _tsnContext;
private TsnStreamHandle _controlStream;
private Timer _controlTimer;
public async Task<ModuleStatus> InitializeAsync(IModuleContext context)
{
// Keep the context for later logging, then check if TSN is available
_context = context;
_tsnContext = context as ITsnModuleContext;
if (_tsnContext == null)
{
context.Logger.Warning("TSN not available - running in compatibility mode");
return ModuleStatus.Initialized;
}
// Reserve a TSN stream for deterministic communication
_controlStream = await _tsnContext.ReserveStreamAsync(new TsnStreamConfig
{
StreamId = $"control-{Id}",
TrafficClass = TsnTrafficClass.Critical,
MaxLatencyMicroseconds = 500, // 500µs max latency
MaxJitterMicroseconds = 50, // 50µs max jitter
BandwidthReservationKbps = 1000, // 1 Mbps reserved
EnableFrameReplication = true // Redundancy for reliability
});
context.Logger.Info($"TSN stream reserved: {_controlStream.StreamId}");
// Subscribe to time-critical events
await context.MessageBus.SubscribeAsync<ControlCommand>(
HandleControlCommand,
new SubscriptionOptions { Queue = "tsn.control.critical" });
return ModuleStatus.Initialized;
}
public async Task<ModuleStatus> StartAsync()
{
if (_tsnContext != null)
{
// Verify time synchronization before starting
var syncState = _tsnContext.GetSyncState();
if (syncState != TsnSyncState.Synchronized)
{
_context.Logger.Warning("Waiting for time synchronization...");
// Wait up to 5 seconds for sync
var timeout = DateTime.UtcNow.AddSeconds(5);
while (_tsnContext.GetSyncState() != TsnSyncState.Synchronized
&& DateTime.UtcNow < timeout)
{
await Task.Delay(100);
}
if (_tsnContext.GetSyncState() != TsnSyncState.Synchronized)
{
return ModuleStatus.Failed;
}
}
// Start 1kHz control loop
_controlTimer = new Timer(ControlLoop, null, 0, 1);
}
return ModuleStatus.Running;
}
private async void ControlLoop(object state)
{
var startTime = _tsnContext.GetSynchronizedTime();
try
{
// Perform control calculations
var controlOutput = CalculateControl();
// Send with deterministic timing
await _tsnContext.TsnMessageBus.PublishAsync(
controlOutput,
new TsnMessageOptions
{
StreamHandle = _controlStream,
TrafficClass = TsnTrafficClass.Critical,
MaxLatencyMicroseconds = 200,
RequireTimeSynchronization = true
});
// Schedule next output precisely
var nextTransmit = startTime.AddMilliseconds(1); // Next millisecond
await _tsnContext.TsnMessageBus.ScheduleMessageAsync(
controlOutput,
nextTransmit,
TsnTrafficClass.Critical);
}
catch (Exception ex)
{
_context.Logger.Error($"Control loop error: {ex.Message}");
}
// Monitor cycle time
var cycleTime = (_tsnContext.GetSynchronizedTime() - startTime).TotalMicroseconds;
if (cycleTime > 500)
{
_context.Logger.Warning($"Cycle overrun: {cycleTime}µs");
}
}
private async Task HandleControlCommand(ControlCommand cmd, MessageContext ctx)
{
// Process with deterministic response time
var receiveTime = _tsnContext.GetSynchronizedTime();
// Process command
var result = ProcessCommand(cmd);
// Send acknowledgment with precise timing
var ack = new CommandAck
{
CommandId = cmd.Id,
ReceiveTime = receiveTime,
ProcessTime = _tsnContext.GetSynchronizedTime(),
Result = result
};
await _tsnContext.TsnMessageBus.PublishAsync(ack, new TsnMessageOptions
{
TrafficClass = TsnTrafficClass.Critical,
CorrelationId = ctx.CorrelationId,
MaxLatencyMicroseconds = 100
});
}
}
import asyncio
from datetime import datetime, timedelta
class RealTimeController:
def __init__(self):
self.tsn_context = None
self.control_stream = None
self.control_task = None
self.context = None
async def initialize(self, context):
# Keep the context for later logging, then check if TSN is available
self.context = context
self.tsn_context = context.get_tsn_context()
if not self.tsn_context:
context["logger"].warning("TSN not available - running in compatibility mode")
return "initialized"
# Reserve a TSN stream for deterministic communication
self.control_stream = await self.tsn_context.reserve_stream({
"stream_id": f"control-{self.id}",
"traffic_class": "Critical",
"max_latency_microseconds": 500,
"max_jitter_microseconds": 50,
"bandwidth_reservation_kbps": 1000,
"enable_frame_replication": True
})
context["logger"].info(f"TSN stream reserved: {self.control_stream.stream_id}")
# Subscribe to time-critical events
await context["message_bus"].subscribe(
"control.command",
self.handle_control_command,
{"queue": "tsn.control.critical"}
)
return "initialized"
async def start(self):
if self.tsn_context:
# Verify time synchronization before starting
sync_state = self.tsn_context.get_sync_state()
if sync_state != "synchronized":
self.context["logger"].warning("Waiting for time synchronization...")
# Wait up to 5 seconds for sync
timeout = datetime.utcnow() + timedelta(seconds=5)
while (self.tsn_context.get_sync_state() != "synchronized"
and datetime.utcnow() < timeout):
await asyncio.sleep(0.1)
if self.tsn_context.get_sync_state() != "synchronized":
return "failed"
# Start 1kHz control loop
self.control_task = asyncio.create_task(self.control_loop())
return "running"
async def control_loop(self):
while True:
start_time = self.tsn_context.get_synchronized_time()
try:
# Perform control calculations
control_output = self.calculate_control()
# Send with deterministic timing
await self.tsn_context.tsn_message_bus.publish(
control_output,
{
"stream_handle": self.control_stream,
"traffic_class": "Critical",
"max_latency_microseconds": 200,
"require_time_synchronization": True
}
)
# Schedule next output precisely
next_transmit = start_time + timedelta(milliseconds=1)
await self.tsn_context.tsn_message_bus.schedule_message(
control_output,
next_transmit,
"Critical"
)
except Exception as e:
self.context["logger"].error(f"Control loop error: {e}")
# Monitor cycle time
cycle_time = (self.tsn_context.get_synchronized_time() - start_time).total_seconds() * 1e6
if cycle_time > 500:
self.context["logger"].warning(f"Cycle overrun: {cycle_time}µs")
# Sleep until next cycle (a fixed sleep drifts over time; compute the
# delay from the next deadline if the loop must hold exactly 1 kHz)
await asyncio.sleep(0.001) # 1ms
async def handle_control_command(self, cmd, ctx):
# Process with deterministic response time
receive_time = self.tsn_context.get_synchronized_time()
# Process command
result = self.process_command(cmd)
# Send acknowledgment with precise timing
ack = {
"command_id": cmd["id"],
"receive_time": receive_time.isoformat(),
"process_time": self.tsn_context.get_synchronized_time().isoformat(),
"result": result
}
await self.tsn_context.tsn_message_bus.publish(ack, {
"traffic_class": "Critical",
"correlation_id": ctx["correlation_id"],
"max_latency_microseconds": 100
})
class RealTimeController : public nexus::ModuleBase {
private:
nexus::ITsnModuleContext* tsn_context_;
TsnStreamHandle control_stream_;
std::jthread control_thread_;
std::atomic<bool> running_{false};
public:
async_result initialize(nexus::IModuleContext* context) override {
// Check if TSN is available
tsn_context_ = dynamic_cast<nexus::ITsnModuleContext*>(context);
if (!tsn_context_) {
context->logger()->warning("TSN not available - running in compatibility mode");
co_return ModuleStatus::Initialized;
}
// Reserve a TSN stream for deterministic communication
TsnStreamConfig config{
.stream_id = fmt::format("control-{}", id()),
.traffic_class = TsnTrafficClass::Critical,
.max_latency_microseconds = 500,
.max_jitter_microseconds = 50,
.bandwidth_reservation_kbps = 1000,
.enable_frame_replication = true
};
control_stream_ = co_await tsn_context_->reserve_stream(config);
context->logger()->info("TSN stream reserved: {}", control_stream_.stream_id);
// Subscribe to time-critical events
co_await context->message_bus()->subscribe<ControlCommand>(
[this](const ControlCommand& cmd, const MessageContext& ctx) -> async_result {
co_await handle_control_command(cmd, ctx);
},
SubscriptionOptions{.queue = "tsn.control.critical"}
);
co_return ModuleStatus::Initialized;
}
async_result start() override {
if (tsn_context_) {
// Verify time synchronization before starting
auto sync_state = tsn_context_->get_sync_state();
if (sync_state != TsnSyncState::Synchronized) {
logger()->warning("Waiting for time synchronization...");
// Wait up to 5 seconds for sync
auto timeout = std::chrono::steady_clock::now() + std::chrono::seconds(5);
while (tsn_context_->get_sync_state() != TsnSyncState::Synchronized
&& std::chrono::steady_clock::now() < timeout) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // brief blocking wait during startup
}
if (tsn_context_->get_sync_state() != TsnSyncState::Synchronized) {
co_return ModuleStatus::Failed;
}
}
// Start 1kHz control loop
running_ = true;
control_thread_ = std::jthread([this](std::stop_token token) {
control_loop(token);
});
}
co_return ModuleStatus::Running;
}
private:
void control_loop(std::stop_token token) {
using namespace std::chrono;
auto next_cycle = steady_clock::now();
while (!token.stop_requested() && running_) {
auto start_time = tsn_context_->get_synchronized_time();
try {
// Perform control calculations
auto control_output = calculate_control();
// Send with deterministic timing
tsn_context_->tsn_message_bus()->publish(
control_output,
TsnMessageOptions{
.stream_handle = control_stream_,
.traffic_class = TsnTrafficClass::Critical,
.max_latency_microseconds = 200,
.require_time_synchronization = true
}
).get();
// Schedule next output precisely
auto next_transmit = start_time + milliseconds(1);
tsn_context_->tsn_message_bus()->schedule_message(
control_output,
next_transmit,
TsnTrafficClass::Critical
).get();
} catch (const std::exception& ex) {
logger()->error("Control loop error: {}", ex.what());
}
// Monitor cycle time
auto cycle_time = duration_cast<microseconds>(
tsn_context_->get_synchronized_time() - start_time
).count();
if (cycle_time > 500) {
logger()->warning("Cycle overrun: {}µs", cycle_time);
}
// Sleep until next cycle
next_cycle += milliseconds(1);
std::this_thread::sleep_until(next_cycle);
}
}
async_result handle_control_command(const ControlCommand& cmd, const MessageContext& ctx) {
// Process with deterministic response time
auto receive_time = tsn_context_->get_synchronized_time();
// Process command
auto result = process_command(cmd);
// Send acknowledgment with precise timing
CommandAck ack{
.command_id = cmd.id,
.receive_time = receive_time,
.process_time = tsn_context_->get_synchronized_time(),
.result = result
};
co_await tsn_context_->tsn_message_bus()->publish(
ack,
TsnMessageOptions{
.traffic_class = TsnTrafficClass::Critical,
.correlation_id = ctx.correlation_id,
.max_latency_microseconds = 100
}
);
}
};
classdef RealTimeController < nexus.Module
properties (Access = private)
tsnContext
controlStream
controlTimer
isRunning = false
end
methods
function status = initialize(obj, context)
% Check if TSN is available
obj.tsnContext = context.getTsnContext();
if isempty(obj.tsnContext)
context.Logger.warning('TSN not available - running in compatibility mode');
status = nexus.ModuleStatus.Initialized;
return;
end
% Reserve a TSN stream for deterministic communication
streamConfig = struct(...
'StreamId', sprintf('control-%s', obj.Id), ...
'TrafficClass', 'Critical', ...
'MaxLatencyMicroseconds', 500, ...
'MaxJitterMicroseconds', 50, ...
'BandwidthReservationKbps', 1000, ...
'EnableFrameReplication', true);
obj.controlStream = obj.tsnContext.reserveStream(streamConfig);
context.Logger.info(sprintf('TSN stream reserved: %s', obj.controlStream.StreamId));
% Subscribe to time-critical events
context.MessageBus.subscribe('control.command', @obj.handleControlCommand, ...
struct('Queue', 'tsn.control.critical'));
status = nexus.ModuleStatus.Initialized;
end
function status = start(obj)
if ~isempty(obj.tsnContext)
% Verify time synchronization before starting
syncState = obj.tsnContext.getSyncState();
if ~strcmp(syncState, 'Synchronized')
obj.Logger.warning('Waiting for time synchronization...');
% Wait up to 5 seconds for sync
timeout = datetime('now') + seconds(5);
while ~strcmp(obj.tsnContext.getSyncState(), 'Synchronized') && ...
datetime('now') < timeout
pause(0.1);
end
if ~strcmp(obj.tsnContext.getSyncState(), 'Synchronized')
status = nexus.ModuleStatus.Failed;
return;
end
end
% Start 1kHz control loop
obj.isRunning = true;
obj.controlTimer = timer(...
'ExecutionMode', 'fixedRate', ...
'Period', 0.001, ... % 1ms
'TimerFcn', @(~,~) obj.controlLoop());
start(obj.controlTimer);
end
status = nexus.ModuleStatus.Running;
end
function controlLoop(obj)
startTime = obj.tsnContext.getSynchronizedTime();
try
% Perform control calculations
controlOutput = obj.calculateControl();
% Send with deterministic timing
options = struct(...
'StreamHandle', obj.controlStream, ...
'TrafficClass', 'Critical', ...
'MaxLatencyMicroseconds', 200, ...
'RequireTimeSynchronization', true);
obj.tsnContext.TsnMessageBus.publish(controlOutput, options);
% Schedule next output precisely
nextTransmit = startTime + milliseconds(1);
obj.tsnContext.TsnMessageBus.scheduleMessage(...
controlOutput, nextTransmit, 'Critical');
catch ME
obj.Logger.error(sprintf('Control loop error: %s', ME.message));
end
% Monitor cycle time
cycleTime = milliseconds(obj.tsnContext.getSynchronizedTime() - startTime);
if cycleTime > 0.5 % 500µs
obj.Logger.warning(sprintf('Cycle overrun: %.1fµs', cycleTime * 1000));
end
end
function handleControlCommand(obj, cmd, ctx)
% Process with deterministic response time
receiveTime = obj.tsnContext.getSynchronizedTime();
% Process command
result = obj.processCommand(cmd);
% Send acknowledgment with precise timing
ack = struct(...
'CommandId', cmd.Id, ...
'ReceiveTime', receiveTime, ...
'ProcessTime', obj.tsnContext.getSynchronizedTime(), ...
'Result', result);
options = struct(...
'TrafficClass', 'Critical', ...
'CorrelationId', ctx.CorrelationId, ...
'MaxLatencyMicroseconds', 100);
obj.tsnContext.TsnMessageBus.publish(ack, options);
end
function stop(obj)
if ~isempty(obj.controlTimer)
stop(obj.controlTimer);
delete(obj.controlTimer);
end
obj.isRunning = false;
end
end
end
// LabVIEW G-Code Representation for TSN Real-Time Controller
// RealTimeController.vi
[Initialize Method]
├─ Get TSN Context
│ ├─ Call Context.GetTsnContext
│ └─ Output: TSN Context Reference
│
├─ Case Structure (TSN Available?)
│ ├─ False Case:
│ │ ├─ Log Warning: "TSN not available - compatibility mode"
│ │ └─ Return Status: Initialized
│ │
│ └─ True Case:
│ ├─ Create Stream Config Cluster
│ │ ├─ StreamId: Format String("control-%s", Module.Id)
│ │ ├─ TrafficClass: "Critical"
│ │ ├─ MaxLatencyMicroseconds: 500
│ │ ├─ MaxJitterMicroseconds: 50
│ │ ├─ BandwidthReservationKbps: 1000
│ │ └─ EnableFrameReplication: True
│ │
│ ├─ Reserve TSN Stream
│ │ ├─ Call TsnContext.ReserveStream
│ │ └─ Output: Control Stream Handle
│ │
│ ├─ Log Info: Stream Reserved
│ │
│ └─ Subscribe to Events
│ ├─ Topic: "control.command"
│ ├─ Handler: HandleControlCommand.vi
│ └─ Options: Queue="tsn.control.critical"
[Start Method]
├─ Case Structure (TSN Context Valid?)
│ └─ True Case:
│ ├─ Check Sync State
│ │ └─ Get TsnContext.SyncState
│ │
│ ├─ While Loop (Wait for Sync)
│ │ ├─ Condition: NOT Synchronized AND Timeout Not Reached
│ │ ├─ Wait 100ms
│ │ └─ Check Sync State
│ │
│ ├─ Case Structure (Synchronized?)
│ │ ├─ False: Return Failed
│ │ └─ True: Continue
│ │
│ └─ Start Control Loop
│ ├─ Create Timed Loop Structure
│ │ ├─ Period: 1ms (1kHz)
│ │ ├─ Priority: Time-Critical
│ │ └─ Processor: Dedicated Core
│ └─ Set Running Flag: True
[Control Loop Timed Structure]
├─ Get Start Time
│ └─ TsnContext.GetSynchronizedTime
│
├─ Try-Catch Structure
│ ├─ Try:
│ │ ├─ Calculate Control Output
│ │ │ └─ Call CalculateControl SubVI
│ │ │
│ │ ├─ Create TSN Message Options
│ │ │ ├─ StreamHandle: Control Stream
│ │ │ ├─ TrafficClass: Critical
│ │ │ ├─ MaxLatencyMicroseconds: 200
│ │ │ └─ RequireTimeSynchronization: True
│ │ │
│ │ ├─ Publish Message
│ │ │ └─ TsnMessageBus.Publish
│ │ │
│ │ └─ Schedule Next Message
│ │ ├─ NextTime: StartTime + 1ms
│ │ └─ TsnMessageBus.ScheduleMessage
│ │
│ └─ Catch:
│ └─ Log Error
│
├─ Calculate Cycle Time
│ ├─ End Time: GetSynchronizedTime
│ ├─ Duration: EndTime - StartTime
│ └─ Convert to Microseconds
│
├─ Case Structure (Cycle Overrun?)
│ └─ Condition: CycleTime > 500µs
│ └─ Log Warning with Cycle Time
│
└─ Wait Until Next Period (Handled by Timed Loop)
[Handle Control Command SubVI]
├─ Input Terminals
│ ├─ Command (Cluster)
│ └─ Context (Cluster)
│
├─ Get Receive Time
│ └─ TsnContext.GetSynchronizedTime
│
├─ Process Command
│ ├─ Call ProcessCommand SubVI
│ └─ Output: Result
│
├─ Create Acknowledgment
│ ├─ Build Cluster
│ │ ├─ CommandId: Command.Id
│ │ ├─ ReceiveTime: Timestamp
│ │ ├─ ProcessTime: GetSynchronizedTime
│ │ └─ Result: Processing Result
│ │
│ └─ Create Options
│ ├─ TrafficClass: Critical
│ ├─ CorrelationId: Context.CorrelationId
│ └─ MaxLatencyMicroseconds: 100
│
└─ Send Acknowledgment
└─ TsnMessageBus.Publish
[TSN Configuration Panel]
├─ Network Interface Selector
├─ Time Sync Status LED
├─ Stream Statistics Display
│ ├─ Latency Histogram
│ ├─ Jitter Graph
│ └─ Bandwidth Usage
└─ Traffic Class Priority Map
TSN Traffic Classes
NEXUS-1 supports 7 TSN traffic classes mapped to IEEE 802.1Q priorities:
| Traffic Class | Priority | Typical Use Case | Max Latency |
|---|---|---|---|
| NetworkControl | 7 | Network management, PTP sync | < 100µs |
| InternetworkControl | 6 | Cross-network control | < 200µs |
| Critical | 5 | Safety systems, motion control | < 500µs |
| ExcellentEffort | 4 | High-priority monitoring | < 1ms |
| Standard | 3 | Normal operations, HMI | < 10ms |
| Background | 2 | File transfers, logging | < 100ms |
| BestEffort | 0-1 | Non-critical data | No guarantee |
Stream Reservation
Reserve dedicated bandwidth and QoS for critical data flows:
// Reserve multiple streams for different data types
var videoStream = await _tsnContext.ReserveStreamAsync(new TsnStreamConfig
{
StreamId = "video-inspection",
TrafficClass = TsnTrafficClass.ExcellentEffort,
BandwidthReservationKbps = 50000, // 50 Mbps for video
MaxFrameSize = 9000, // Jumbo frames
VlanId = 200
});
var sensorStream = await _tsnContext.ReserveStreamAsync(new TsnStreamConfig
{
StreamId = "sensor-data",
TrafficClass = TsnTrafficClass.Critical,
BandwidthReservationKbps = 100, // 100 kbps for sensors
MaxLatencyMicroseconds = 250,
EnableFrameReplication = true // Redundant transmission
});
// Use streams for publishing
await _tsnContext.TsnMessageBus.PublishAsync(videoFrame, new TsnMessageOptions
{
StreamHandle = videoStream,
RequireTimeSynchronization = false // Video doesn't need precise sync
});
await _tsnContext.TsnMessageBus.PublishAsync(sensorData, new TsnMessageOptions
{
StreamHandle = sensorStream,
RequireTimeSynchronization = true // Sensors need timestamp accuracy
});
Time-Aware Scheduling
Schedule messages for transmission at precise future times:
// Coordinate multiple actuators with synchronized timing
var syncTime = _tsnContext.GetSynchronizedTime();
var actuationTime = syncTime.AddMilliseconds(10); // 10ms in future
// Schedule all actuator commands for simultaneous execution
var tasks = new List<Task>();
foreach (var actuator in actuators)
{
var command = new ActuatorCommand
{
Id = actuator.Id,
Position = actuator.TargetPosition
};
tasks.Add(_tsnContext.TsnMessageBus.ScheduleMessageAsync(
command,
actuationTime, // All execute at same time
TsnTrafficClass.Critical
));
}
await Task.WhenAll(tasks);
_context.Logger.Info($"Scheduled {actuators.Count} actuators for {actuationTime}");
Monitoring TSN Performance
Monitor network performance and synchronization status:
// Get TSN network statistics
var stats = await _tsnContext.TsnMessageBus.GetNetworkStatsAsync();
_context.Logger.Info($@"TSN Network Stats:
Sync State: {stats.SyncState}
Clock Offset: {stats.ClockOffset.TotalMicroseconds}µs
Transmitted: {stats.TransmittedFrames}
Received: {stats.ReceivedFrames}
Dropped: {stats.DroppedFrames}
Late Frames: {stats.LateFrames}
Avg Latency: {stats.AverageLatencyMicroseconds}µs
Max Latency: {stats.MaxLatencyMicroseconds}µs
Jitter: {stats.JitterMicroseconds}µs");
// Monitor sync state changes
if (stats.SyncState != TsnSyncState.Synchronized)
{
_context.Logger.Warning("Time sync lost - degrading to best-effort mode");
await SwitchToBestEffortMode();
}
// Track latency violations
if (stats.MaxLatencyMicroseconds > 500)
{
_context.Metrics.RecordCounter("tsn.latency.violations");
_context.Logger.Error($"Latency violation: {stats.MaxLatencyMicroseconds}µs");
}
Best Practices for TSN
- Hardware Selection: Use TSN-capable NICs and switches with hardware timestamping
- Network Design: Separate VLANs for different traffic classes
- Time Sync First: Always verify synchronization before time-critical operations
- Resource Planning: Calculate bandwidth requirements and reserve streams appropriately
- Graceful Degradation: Implement fallback for when TSN is unavailable
- Monitor Everything: Track latency, jitter, and sync state continuously
- Test Under Load: Validate timing under worst-case network conditions
- Keep Payloads Small: Minimize message size for lowest latency
Common TSN Use Cases
Motion Control
Coordinate multi-axis motion with microsecond precision for robotics and CNC machines.
- Sub-millisecond control loops
- Synchronized multi-axis movement
- Deterministic sensor feedback
Process Control
Time-critical monitoring and control of industrial processes.
- Guaranteed sensor sampling rates
- Coordinated actuator control
- Safety interlock timing
Power Systems
Synchronized measurements and control for electrical grids.
- Synchrophasor measurements
- Protection relay coordination
- Distributed generation control
Best Practices
Follow these guidelines to build robust, scalable, and maintainable NEXUS-1 modules.
Module Design
- Single Responsibility: Each module should have one clear purpose
- Loose Coupling: Modules communicate only through the message bus
- High Cohesion: Related functionality should be grouped together
- Fail Fast: Detect and report errors early in the process
- Graceful Degradation: Handle partial failures without crashing
Message Handling
- Use Structured Messages: Define clear message schemas and validate inputs
- Topic Naming: Use hierarchical topics (e.g., "sensors.temperature.zone1")
- Error Handling: Always handle exceptions in message handlers
- Timeouts: Set appropriate timeouts for request/response patterns
- Idempotency: Design handlers to be idempotent where possible (see the sketch after this list)
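To make the last two points concrete, here is a minimal sketch that deduplicates on a message ID so redeliveries become no-ops and bounds a request/response wait with a cancellation token. The types and the commented RequestAsync call are assumptions for illustration, not SDK API.
using System;
using System.Collections.Concurrent;
using System.Threading;
using System.Threading.Tasks;
// Hypothetical message type for the example
public record SensorReading(string MessageId, string Sensor, double Value);
public class IdempotentHandler
{
    // Remember handled IDs so a redelivered message causes no second side effect.
    private readonly ConcurrentDictionary<string, bool> _processedIds = new();
    public Task HandleAsync(SensorReading msg)
    {
        // TryAdd returns false when the ID has been seen before.
        if (!_processedIds.TryAdd(msg.MessageId, true))
            return Task.CompletedTask; // duplicate delivery: skip side effects
        // ... perform the actual side effect exactly once ...
        return Task.CompletedTask;
    }
    public async Task<string?> QueryWithTimeoutAsync()
    {
        // Bound a request/response wait instead of waiting forever.
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
        // var reply = await messageBus.RequestAsync<Query, Reply>(query, cts.Token);
        await Task.Delay(100, cts.Token); // stand-in for the actual request
        return "ok";
    }
}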
Performance
- Async Operations: Use async/await for I/O-bound operations
- Batch Processing: Group related operations to reduce overhead (see the sketch after this list)
- Resource Management: Properly dispose of resources and connections
- Memory Usage: Monitor and optimize memory consumption
- Message Size: Keep message payloads reasonably sized
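As one way to apply the batch-processing point, readings can be buffered and published as a single message once a batch fills; TelemetryBatcher and its publishBatch delegate below are assumed names, not SDK types.
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
// Buffer readings and publish one batch message instead of N single messages.
public class TelemetryBatcher
{
    private readonly List<double> _buffer = new();
    private readonly int _batchSize;
    private readonly Func<IReadOnlyList<double>, Task> _publishBatch;
    public TelemetryBatcher(int batchSize, Func<IReadOnlyList<double>, Task> publishBatch)
    {
        _batchSize = batchSize;
        _publishBatch = publishBatch;
    }
    public async Task AddAsync(double reading)
    {
        _buffer.Add(reading);
        if (_buffer.Count >= _batchSize)
        {
            await _publishBatch(_buffer.ToArray()); // one publish for the whole batch
            _buffer.Clear();
        }
    }
}
A production version would also flush on a timer so a partially filled batch is not held indefinitely.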
Security
- Input Validation: Always validate incoming data
- Secrets Management: Never hard-code credentials or keys
- Least Privilege: Request only the capabilities your module needs
- Audit Logging: Log security-relevant events
- Secure Communication: Use encrypted channels for sensitive data
Testing
- Unit Tests: Test individual components in isolation (see the test sketch after this list)
- Integration Tests: Test module interactions with the message bus
- Error Scenarios: Test failure cases and error handling
- Performance Tests: Measure and optimize critical paths
- Continuous Integration: Automate testing in your CI/CD pipeline
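For the unit-testing point, keeping decision logic in pure functions lets it be tested without a running message bus. A minimal sketch assuming xUnit as the test framework; AlertLogic is a hypothetical helper, not SDK API.
using Xunit;
// Pure decision logic extracted from a module so it is testable in isolation.
public static class AlertLogic
{
    public static bool CheckThreshold(double value, double threshold) => value > threshold;
}
public class AlertLogicTests
{
    [Theory]
    [InlineData(80.0, 75.0, true)]  // above threshold -> alert
    [InlineData(70.0, 75.0, false)] // below threshold -> no alert
    public void ThresholdComparisonBehavesAsExpected(double value, double threshold, bool expected)
    {
        Assert.Equal(expected, AlertLogic.CheckThreshold(value, threshold));
    }
}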
GitOps & CI/CD Integration
Deploy and manage your Nexus-1 modules using modern GitOps workflows and CI/CD pipelines.
New in Nexus-1 v2.0
Full GitOps support with declarative configuration management, automated deployments, and comprehensive CI/CD integration across GitHub Actions, GitLab CI, Jenkins, and Tekton.
Overview
Nexus-1 provides enterprise-grade GitOps integration enabling:
- Declarative Configuration: Define module configurations in Git repositories
- Automated Synchronization: Automatic deployment of configuration changes
- Multi-Environment Management: Separate configurations for dev, staging, and production
- Progressive Delivery: Canary, Blue/Green, and rolling deployments
- Pipeline Integration: Native support for major CI/CD platforms
- Security & Compliance: Image signing, SBOM generation, and policy enforcement
Deployment Manager API
Use the IDeploymentManager interface to programmatically manage deployments:
using Nexus.Contracts.GitOps;
using Microsoft.Extensions.DependencyInjection;
public class DeploymentExample
{
private readonly IDeploymentManager _deploymentManager;
public DeploymentExample(IServiceProvider services)
{
_deploymentManager = services.GetRequiredService<IDeploymentManager>();
}
public async Task DeployModuleWithCanaryAsync()
{
// Configure canary deployment
var request = new DeploymentRequest
{
ModuleId = "temperature-monitor",
Version = "2.1.0",
Environment = "production",
Strategy = DeploymentStrategy.Canary,
Configuration = new Dictionary<string, string>
{
["sampling_rate"] = "1000",
["alert_threshold"] = "75.0"
},
Options = new DeploymentOptions
{
Canary = new CanaryOptions
{
InitialPercentage = 10,
StepDuration = TimeSpan.FromMinutes(10),
TrafficSteps = new[] { 10, 25, 50, 100 },
AnalysisTemplate = new AnalysisTemplate
{
Name = "performance-analysis",
Metrics = new Dictionary<string, string>
{
["error_rate"] = "prometheus",
["latency_p99"] = "prometheus"
},
SuccessCondition = "error_rate < 0.01 && latency_p99 < 100",
FailureCondition = "error_rate > 0.05 || latency_p99 > 500"
}
}
}
};
// Deploy with monitoring
var result = await _deploymentManager.DeployModuleAsync(request);
Console.WriteLine($"Deployment started: {result.DeploymentId}");
// Monitor deployment progress
while (true)
{
var status = await _deploymentManager.GetDeploymentStatusAsync(result.DeploymentId);
Console.WriteLine($"Status: {status.State}, Ready: {status.ReplicasReady}/{status.ReplicasTotal}");
if (status.State == DeploymentState.Succeeded)
{
Console.WriteLine("Deployment completed successfully!");
break;
}
else if (status.State == DeploymentState.Failed)
{
Console.WriteLine($"Deployment failed: {status.Conditions.FirstOrDefault()?.Message}");
// Trigger a rollback of the failed deployment
await _deploymentManager.RollbackDeploymentAsync(
request.ModuleId,
status.CurrentRevision
);
break;
}
await Task.Delay(TimeSpan.FromSeconds(30));
}
}
public async Task PromoteToProductionAsync(string deploymentId)
{
// Promote successful staging deployment to production
var result = await _deploymentManager.PromoteDeploymentAsync(
deploymentId,
"production"
);
if (result.Success)
{
Console.WriteLine($"Promoted to production: {result.NewDeploymentId}");
}
else
{
foreach (var validation in result.ValidationResults)
{
Console.WriteLine($"Validation {validation.Name}: {validation.Message}");
}
}
}
}
from nexus_sdk.gitops import DeploymentManager, DeploymentRequest, DeploymentStrategy
from nexus_sdk.gitops import CanaryOptions, AnalysisTemplate, DeploymentState
import asyncio
class DeploymentExample:
def __init__(self, deployment_manager: DeploymentManager):
self.deployment_manager = deployment_manager
async def deploy_module_with_canary(self):
# Configure canary deployment
request = DeploymentRequest(
module_id="temperature-monitor",
version="2.1.0",
environment="production",
strategy=DeploymentStrategy.CANARY,
configuration={
"sampling_rate": "1000",
"alert_threshold": "75.0"
},
options={
"canary": CanaryOptions(
initial_percentage=10,
step_duration=600, # 10 minutes
traffic_steps=[10, 25, 50, 100],
analysis_template=AnalysisTemplate(
name="performance-analysis",
metrics={
"error_rate": "prometheus",
"latency_p99": "prometheus"
},
success_condition="error_rate < 0.01 and latency_p99 < 100",
failure_condition="error_rate > 0.05 or latency_p99 > 500"
)
)
}
)
# Deploy with monitoring
result = await self.deployment_manager.deploy_module(request)
print(f"Deployment started: {result.deployment_id}")
# Monitor deployment progress
while True:
status = await self.deployment_manager.get_deployment_status(
result.deployment_id
)
print(f"Status: {status.state}, Ready: {status.replicas_ready}/{status.replicas_total}")
if status.state == DeploymentState.SUCCEEDED:
print("Deployment completed successfully!")
break
elif status.state == DeploymentState.FAILED:
print(f"Deployment failed: {status.conditions[0].message if status.conditions else 'Unknown'}")
# Trigger a rollback of the failed deployment
await self.deployment_manager.rollback_deployment(
request.module_id,
status.current_revision
)
break
await asyncio.sleep(30)
async def promote_to_production(self, deployment_id: str):
# Promote successful staging deployment to production
result = await self.deployment_manager.promote_deployment(
deployment_id,
"production"
)
if result.success:
print(f"Promoted to production: {result.new_deployment_id}")
else:
for validation in result.validation_results:
print(f"Validation {validation.name}: {validation.message}")
# Usage
async def main():
deployment_manager = DeploymentManager()
example = DeploymentExample(deployment_manager)
await example.deploy_module_with_canary()
asyncio.run(main())
#include <nexus/sdk/gitops.h>
#include <iostream>
#include <chrono>
using namespace nexus::sdk::gitops;
using namespace std::chrono_literals;
class DeploymentExample {
public:
DeploymentExample(std::shared_ptr<IDeploymentManager> deployment_manager)
: deployment_manager_(deployment_manager) {}
async::task<void> deploy_module_with_canary() {
// Configure canary deployment
DeploymentRequest request{
.module_id = "temperature-monitor",
.version = "2.1.0",
.environment = "production",
.strategy = DeploymentStrategy::Canary,
.configuration = {
{"sampling_rate", "1000"},
{"alert_threshold", "75.0"}
},
.options = DeploymentOptions{
.canary = CanaryOptions{
.initial_percentage = 10,
.step_duration = 10min,
.traffic_steps = {10, 25, 50, 100},
.analysis_template = AnalysisTemplate{
.name = "performance-analysis",
.metrics = {
{"error_rate", "prometheus"},
{"latency_p99", "prometheus"}
},
.success_condition = "error_rate < 0.01 && latency_p99 < 100",
.failure_condition = "error_rate > 0.05 || latency_p99 > 500"
}
}
}
};
// Deploy with monitoring
auto result = co_await deployment_manager_->deploy_module_async(request);
std::cout << "Deployment started: " << result.deployment_id << std::endl;
// Monitor deployment progress
while (true) {
auto status = co_await deployment_manager_->get_deployment_status_async(
result.deployment_id
);
std::cout << "Status: " << to_string(status.state)
<< ", Ready: " << status.replicas_ready
<< "/" << status.replicas_total << std::endl;
if (status.state == DeploymentState::Succeeded) {
std::cout << "Deployment completed successfully!" << std::endl;
break;
} else if (status.state == DeploymentState::Failed) {
std::cout << "Deployment failed: "
<< (status.conditions.empty() ? "Unknown" : status.conditions[0].message)
<< std::endl;
// Trigger a rollback of the failed deployment
co_await deployment_manager_->rollback_deployment_async(
request.module_id,
status.current_revision
);
break;
}
co_await async::sleep_for(30s);
}
}
async::task<void> promote_to_production(const std::string& deployment_id) {
// Promote successful staging deployment to production
auto result = co_await deployment_manager_->promote_deployment_async(
deployment_id,
"production"
);
if (result.success) {
std::cout << "Promoted to production: " << result.new_deployment_id << std::endl;
} else {
for (const auto& validation : result.validation_results) {
std::cout << "Validation " << validation.name
<< ": " << validation.message << std::endl;
}
}
}
private:
std::shared_ptr<IDeploymentManager> deployment_manager_;
};
classdef DeploymentExample < handle
properties (Access = private)
deploymentManager
end
methods
function obj = DeploymentExample(deploymentManager)
obj.deploymentManager = deploymentManager;
end
function deployModuleWithCanary(obj)
% Configure canary deployment
request = nexus.gitops.DeploymentRequest();
request.moduleId = 'temperature-monitor';
request.version = '2.1.0';
request.environment = 'production';
request.strategy = nexus.gitops.DeploymentStrategy.Canary;
% Set configuration
request.configuration = containers.Map(...
{'sampling_rate', 'alert_threshold'}, ...
{'1000', '75.0'});
% Configure canary options
canaryOptions = nexus.gitops.CanaryOptions();
canaryOptions.initialPercentage = 10;
canaryOptions.stepDuration = minutes(10);
canaryOptions.trafficSteps = [10, 25, 50, 100];
% Set up analysis template
analysisTemplate = nexus.gitops.AnalysisTemplate();
analysisTemplate.name = 'performance-analysis';
analysisTemplate.metrics = containers.Map(...
{'error_rate', 'latency_p99'}, ...
{'prometheus', 'prometheus'});
analysisTemplate.successCondition = 'error_rate < 0.01 && latency_p99 < 100';
analysisTemplate.failureCondition = 'error_rate > 0.05 || latency_p99 > 500';
canaryOptions.analysisTemplate = analysisTemplate;
request.options.canary = canaryOptions;
% Deploy with monitoring
result = obj.deploymentManager.deployModule(request);
fprintf('Deployment started: %s\n', result.deploymentId);
% Monitor deployment progress
while true
status = obj.deploymentManager.getDeploymentStatus(result.deploymentId);
fprintf('Status: %s, Ready: %d/%d\n', ...
char(status.state), status.replicasReady, status.replicasTotal);
if status.state == nexus.gitops.DeploymentState.Succeeded
fprintf('Deployment completed successfully!\n');
break;
elseif status.state == nexus.gitops.DeploymentState.Failed
if ~isempty(status.conditions)
fprintf('Deployment failed: %s\n', status.conditions(1).message);
else
fprintf('Deployment failed: Unknown reason\n');
end
% Trigger a rollback of the failed deployment
obj.deploymentManager.rollbackDeployment(...
request.moduleId, status.currentRevision);
break;
end
pause(30);
end
end
function promoteToProduction(obj, deploymentId)
% Promote successful staging deployment to production
result = obj.deploymentManager.promoteDeployment(...
deploymentId, 'production');
if result.success
fprintf('Promoted to production: %s\n', result.newDeploymentId);
else
for i = 1:length(result.validationResults)
validation = result.validationResults(i);
fprintf('Validation %s: %s\n', validation.name, validation.message);
end
end
end
end
end
% Usage example
deploymentManager = nexus.gitops.DeploymentManager();
example = DeploymentExample(deploymentManager);
example.deployModuleWithCanary();
// LabVIEW Deployment Example
// This would be implemented as a VI with the following structure:
// 1. Create Deployment Request VI
// Inputs:
// - Module ID: "temperature-monitor" (string)
// - Version: "2.1.0" (string)
// - Environment: "production" (string)
// - Strategy: Canary (enum)
// - Configuration Cluster:
// - sampling_rate: "1000"
// - alert_threshold: "75.0"
//
// Canary Options Cluster:
// - Initial Percentage: 10
// - Step Duration: 600000 (ms)
// - Traffic Steps: [10, 25, 50, 100] (array)
// - Analysis Template:
// - Name: "performance-analysis"
// - Metrics: Map of metric names to sources
// - Success Condition: "error_rate < 0.01 && latency_p99 < 100"
// - Failure Condition: "error_rate > 0.05 || latency_p99 > 500"
// 2. Deploy Module VI
// Uses: Nexus.GitOps.DeploymentManager.lvlib:DeployModule.vi
// Inputs: Deployment Request (cluster)
// Outputs:
// - Deployment Result (cluster)
// - Error (cluster)
// 3. Monitor Deployment VI (While Loop)
// Loop Components:
// a. Get Deployment Status SubVI
// - Input: Deployment ID
// - Output: Status cluster (state, replicas ready/total)
//
// b. Case Structure based on State:
// - Succeeded: Exit loop, display success
// - Failed: Call Rollback VI, exit loop
// - Other: Continue monitoring
//
// c. Wait 30000ms between checks
// 4. Rollback Deployment VI (if needed)
// Uses: Nexus.GitOps.DeploymentManager.lvlib:RollbackDeployment.vi
// Inputs:
// - Module ID
// - Target Revision
// Outputs:
// - Rollback Result
// - Error
// Error Handling:
// - Each VI should have error in/out clusters
// - Use Simple Error Handler.vi for display
// - Log errors to file using Write to Text File.vi
// Front Panel Controls:
// - Module Configuration (cluster)
// - Deployment Strategy (enum)
// - Environment Selector (ring)
// - Start Deployment (button)
// - Deployment Status (string indicator)
// - Progress Bar (numeric indicator)
// Block Diagram would wire these components together
// with proper error handling and state management
CI/CD Pipeline Integration
Integrate Nexus-1 deployments into your existing CI/CD pipelines:
name: Deploy Module to Nexus-1
on:
push:
branches: [main]
tags: ['v*']
pull_request:
branches: [main]
jobs:
build-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Module SDK
uses: nexus-1/setup-sdk@v1
with:
sdk-version: '1.0.0'
- name: Build Module
run: |
nexus module build \
--module-path ./src \
--output ./dist
- name: Run Tests
run: |
nexus module test \
--module-path ./src \
--coverage
- name: Security Scan
uses: nexus-1/security-scan@v1
with:
module-path: ./dist
severity: HIGH,CRITICAL
- name: Upload Artifacts
uses: actions/upload-artifact@v4
with:
name: module-package
path: ./dist/*.nxm
deploy-staging:
needs: build-test
if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/v') # run on release tags too, so deploy-production can follow
runs-on: ubuntu-latest
environment: staging
steps:
- uses: actions/checkout@v4
- name: Download Module
uses: actions/download-artifact@v4
with:
name: module-package
path: ./dist
- name: Deploy to Staging
uses: nexus-1/deploy@v1
with:
module-package: ./dist/*.nxm
environment: staging
strategy: rolling
nexus-url: ${{ secrets.NEXUS_STAGING_URL }}
api-key: ${{ secrets.NEXUS_STAGING_API_KEY }}
- name: Run Integration Tests
uses: nexus-1/integration-tests@v1
with:
environment: staging
test-suite: ./tests/integration
deploy-production:
needs: deploy-staging
if: startsWith(github.ref, 'refs/tags/v')
runs-on: ubuntu-latest
environment: production
steps:
- uses: actions/checkout@v4
- name: Download Module
uses: actions/download-artifact@v4
with:
name: module-package
path: ./dist
- name: Deploy to Production
id: deploy # referenced by the Monitor Deployment step below
uses: nexus-1/deploy@v1
with:
module-package: ./dist/*.nxm
environment: production
strategy: canary
canary-percentage: 10
canary-duration: 10m
auto-promote: true
nexus-url: ${{ secrets.NEXUS_PROD_URL }}
api-key: ${{ secrets.NEXUS_PROD_API_KEY }}
- name: Monitor Deployment
uses: nexus-1/monitor-deployment@v1
with:
deployment-id: ${{ steps.deploy.outputs.deployment-id }}
metrics-query: |
error_rate < 0.01 AND
latency_p99 < 100ms
timeout: 30m
stages:
- build
- test
- security
- deploy
- verify
variables:
MODULE_NAME: temperature-monitor
NEXUS_REGISTRY: registry.nexus-1.io
build:module:
stage: build
image: nexus-1/sdk:latest
script:
- nexus module build --module-path ./src --output ./dist
- nexus module package --input ./dist --output ${MODULE_NAME}.nxm
artifacts:
paths:
- ${MODULE_NAME}.nxm
expire_in: 1 hour
test:unit:
stage: test
image: nexus-1/sdk:latest
script:
- nexus module test --module-path ./src --type unit
coverage: '/Total coverage: \d+\.\d+%/'
test:integration:
stage: test
services:
- name: nexus-1/simulator:latest
alias: nexus-simulator
script:
- nexus module test --module-path ./src --type integration --nexus-url http://nexus-simulator:8080
security:scan:
stage: security
image: aquasec/trivy:latest
script:
- trivy fs --severity HIGH,CRITICAL --exit-code 1 .
- trivy config --severity HIGH,CRITICAL --exit-code 1 .
security:sign:
stage: security
needs: ["build:module"]
image: sigstore/cosign:latest
script:
- cosign sign --yes ${MODULE_NAME}.nxm
deploy:staging:
stage: deploy
needs: ["security:sign"]
environment:
name: staging
url: https://staging.nexus-1.io
only:
- main
script:
- |
nexus deploy \
--module ${MODULE_NAME}.nxm \
--environment staging \
--strategy rolling \
--manifest ./deploy/staging.yaml \
--wait
deploy:production:
stage: deploy
needs: ["deploy:staging", "verify:staging"]
environment:
name: production
url: https://nexus-1.io
when: manual
only:
- tags
script:
- |
nexus deploy \
--module ${MODULE_NAME}.nxm \
--environment production \
--strategy canary \
--canary-steps "10,25,50,100" \
--canary-duration 10m \
--manifest ./deploy/production.yaml \
--auto-rollback
verify:staging:
stage: verify
needs: ["deploy:staging"]
script:
- nexus verify --environment staging --timeout 5m
- nexus test smoke --environment staging
verify:production:
stage: verify
needs: ["deploy:production"]
script:
- nexus verify --environment production --timeout 10m
- nexus test smoke --environment production
- nexus metrics check --query "error_rate < 0.01"
pipeline {
agent {
kubernetes {
yaml '''
apiVersion: v1
kind: Pod
spec:
containers:
- name: nexus-sdk
image: nexus-1/sdk:latest
command: ['sleep', '999999']
- name: docker
image: docker:dind
securityContext:
privileged: true
'''
}
}
environment {
MODULE_NAME = 'temperature-monitor'
NEXUS_STAGING_URL = credentials('nexus-staging-url')
NEXUS_STAGING_KEY = credentials('nexus-staging-api-key')
NEXUS_PROD_URL = credentials('nexus-prod-url')
NEXUS_PROD_KEY = credentials('nexus-prod-api-key')
}
stages {
stage('Build') {
steps {
container('nexus-sdk') {
sh '''
nexus module build \
--module-path ./src \
--output ./dist
nexus module package \
--input ./dist \
--output ${MODULE_NAME}.nxm
'''
}
}
}
stage('Test') {
parallel {
stage('Unit Tests') {
steps {
container('nexus-sdk') {
sh 'nexus module test --type unit --coverage'
}
}
}
stage('Integration Tests') {
steps {
container('nexus-sdk') {
sh 'nexus module test --type integration'
}
}
}
stage('Security Scan') {
steps {
container('nexus-sdk') {
sh '''
nexus security scan \
--module ${MODULE_NAME}.nxm \
--policy security-policy.yaml
'''
}
}
}
}
}
stage('Deploy to Staging') {
when {
branch 'main'
}
steps {
container('nexus-sdk') {
sh '''
nexus deploy \
--module ${MODULE_NAME}.nxm \
--environment staging \
--url ${NEXUS_STAGING_URL} \
--api-key ${NEXUS_STAGING_KEY} \
--strategy rolling \
--wait
'''
}
}
}
stage('Staging Validation') {
when {
branch 'main'
}
steps {
container('nexus-sdk') {
sh '''
nexus verify --environment staging
nexus test smoke --environment staging
nexus metrics check \
--environment staging \
--duration 5m \
--query "error_rate < 0.05"
'''
}
}
}
stage('Deploy to Production') {
when {
tag pattern: "v\\\\d+\\\\.\\\\d+\\\\.\\\\d+", comparator: "REGEXP"
}
input {
message "Deploy to production?"
parameters {
choice(
name: 'DEPLOYMENT_STRATEGY',
choices: ['canary', 'blue-green', 'rolling'],
description: 'Select deployment strategy'
)
}
}
steps {
container('nexus-sdk') {
script {
if (env.DEPLOYMENT_STRATEGY == 'canary') {
sh '''
nexus deploy \
--module ${MODULE_NAME}.nxm \
--environment production \
--url ${NEXUS_PROD_URL} \
--api-key ${NEXUS_PROD_KEY} \
--strategy canary \
--canary-analysis analysis.yaml \
--auto-promote
'''
} else {
sh """
nexus deploy \
--module ${MODULE_NAME}.nxm \
--environment production \
--url ${NEXUS_PROD_URL} \
--api-key ${NEXUS_PROD_KEY} \
--strategy ${env.DEPLOYMENT_STRATEGY}
"""
}
}
}
}
}
}
post {
always {
archiveArtifacts artifacts: '*.nxm', fingerprint: true
junit '**/test-results/*.xml'
publishHTML([
reportDir: 'coverage',
reportFiles: 'index.html',
reportName: 'Coverage Report'
])
}
success {
slackSend(
color: 'good',
message: "Module ${MODULE_NAME} deployed successfully"
)
}
failure {
slackSend(
color: 'danger',
message: "Module ${MODULE_NAME} deployment failed"
)
}
}
}
apiVersion: tekton.dev/v1beta1
kind: Pipeline
metadata:
name: nexus-module-pipeline
spec:
params:
- name: module-name
default: temperature-monitor
- name: git-url
description: Git repository URL
- name: git-revision
default: main
- name: deployment-strategy
default: rolling
workspaces:
- name: shared-data
- name: nexus-credentials
tasks:
- name: fetch-source
taskRef:
name: git-clone
workspaces:
- name: output
workspace: shared-data
params:
- name: url
value: $(params.git-url)
- name: revision
value: $(params.git-revision)
- name: build-module
runAfter: ["fetch-source"]
taskRef:
name: nexus-module-build
workspaces:
- name: source
workspace: shared-data
params:
- name: module-path
value: ./src
- name: output-path
value: ./dist
- name: test-module
runAfter: ["build-module"]
taskRef:
name: nexus-module-test
workspaces:
- name: source
workspace: shared-data
params:
- name: test-type
value: all
- name: coverage-threshold
value: "80"
- name: security-scan
runAfter: ["build-module"]
taskRef:
name: trivy-scan
params:
- name: image
value: $(params.module-name)
- name: severity
value: "HIGH,CRITICAL"
- name: package-module
runAfter: ["test-module", "security-scan"]
taskRef:
name: nexus-module-package
workspaces:
- name: source
workspace: shared-data
params:
- name: module-name
value: $(params.module-name)
- name: version
value: $(tasks.fetch-source.results.commit)
- name: deploy-staging
runAfter: ["package-module"]
taskRef:
name: nexus-deploy
workspaces:
- name: source
workspace: shared-data
- name: credentials
workspace: nexus-credentials
params:
- name: module
value: $(params.module-name).nxm
- name: environment
value: staging
- name: strategy
value: $(params.deployment-strategy)
- name: wait
value: "true"
- name: verify-staging
runAfter: ["deploy-staging"]
taskRef:
name: nexus-verify
params:
- name: environment
value: staging
- name: checks
value:
- health: healthy
- metrics: "error_rate < 0.05"
- endpoints: all
finally:
- name: notify-status
taskRef:
name: slack-notification
params:
- name: message
value: |
Module: $(params.module-name)
Status: $(tasks.status)
Commit: $(tasks.fetch-source.results.commit)
---
apiVersion: tekton.dev/v1beta1
kind: Task
metadata:
name: nexus-module-build
spec:
description: Build Nexus module
workspaces:
- name: source
params:
- name: module-path
type: string
- name: output-path
type: string
steps:
- name: build
image: nexus-1/sdk:latest
workingDir: $(workspaces.source.path)
script: |
#!/bin/bash
set -ex
nexus module build \
--module-path $(params.module-path) \
--output $(params.output-path)
---
apiVersion: tekton.dev/v1beta1
kind: Task
metadata:
name: nexus-deploy
spec:
description: Deploy module to Nexus
workspaces:
- name: source
- name: credentials
params:
- name: module
type: string
- name: environment
type: string
- name: strategy
type: string
- name: wait
type: string
default: "true"
steps:
- name: deploy
image: nexus-1/sdk:latest
workingDir: $(workspaces.source.path)
env:
- name: NEXUS_API_KEY
valueFrom:
secretKeyRef:
name: nexus-credentials
key: api-key
script: |
#!/bin/bash
set -ex
WAIT_FLAG=""
if [[ "$(params.wait)" == "true" ]]; then
WAIT_FLAG="--wait"
fi
nexus deploy \
--module $(params.module) \
--environment $(params.environment) \
--strategy $(params.strategy) \
$WAIT_FLAG
---
apiVersion: triggers.tekton.dev/v1beta1
kind: TriggerTemplate
metadata:
name: nexus-module-triggertemplate
spec:
params:
- name: git-revision
- name: git-url
resourcetemplates:
- apiVersion: tekton.dev/v1beta1
kind: PipelineRun
metadata:
generateName: nexus-module-run-
spec:
pipelineRef:
name: nexus-module-pipeline
params:
- name: git-url
value: $(tt.params.git-url)
- name: git-revision
value: $(tt.params.git-revision)
workspaces:
- name: shared-data
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
- name: nexus-credentials
secret:
secretName: nexus-api-credentials
GitOps Repository Management
Manage module configurations declaratively using Git:
# ArgoCD Application for Nexus Module
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: temperature-monitor
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: nexus-modules
source:
repoURL: https://github.com/myorg/nexus-modules
targetRevision: main
path: modules/temperature-monitor
# Helm values for module configuration
helm:
valueFiles:
- values.yaml
- values-production.yaml
parameters:
- name: module.version
value: "2.1.0"
- name: module.replicas
value: "3"
- name: module.resources.requests.memory
value: "512Mi"
- name: module.resources.requests.cpu
value: "500m"
- name: module.config.samplingRate
value: "1000"
- name: module.config.alertThreshold
value: "75.0"
destination:
server: https://kubernetes.default.svc
namespace: nexus-modules
syncPolicy:
automated:
prune: true
selfHeal: true
allowEmpty: false
syncOptions:
- CreateNamespace=true
- PrunePropagationPolicy=foreground
- PruneLast=true
retry:
limit: 5
backoff:
duration: 5s
factor: 2
maxDuration: 3m
revisionHistoryLimit: 10
# Health checks for module
health:
- group: apps
kind: Deployment
name: temperature-monitor
namespace: nexus-modules
---
# ApplicationSet for multi-environment deployment
apiVersion: argoproj.io/v1alpha1
kind: ApplicationSet
metadata:
name: nexus-modules-environments
namespace: argocd
spec:
generators:
- matrix:
generators:
- git:
repoURL: https://github.com/myorg/nexus-modules
revision: main
directories:
- path: modules/*
- list:
elements:
- env: dev
cluster: dev-cluster
- env: staging
cluster: staging-cluster
- env: production
cluster: production-cluster
template:
metadata:
name: '{{path.basename}}-{{env}}'
spec:
project: nexus-modules
source:
repoURL: https://github.com/myorg/nexus-modules
targetRevision: main
path: '{{path}}'
helm:
valueFiles:
- values.yaml
- values-{{env}}.yaml
destination:
server: '{{cluster}}'
namespace: 'nexus-modules-{{env}}'
syncPolicy:
automated:
prune: true
selfHeal: true
# Flux GitRepository source
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
name: nexus-modules
namespace: flux-system
spec:
interval: 1m
ref:
branch: main
url: https://github.com/myorg/nexus-modules
secretRef:
name: github-credentials
---
# HelmRepository for Nexus charts
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
name: nexus-charts
namespace: flux-system
spec:
interval: 10m
url: https://charts.nexus-1.io
---
# Kustomization for module deployment
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: temperature-monitor
namespace: flux-system
spec:
interval: 10m
path: "./modules/temperature-monitor"
prune: true
sourceRef:
kind: GitRepository
name: nexus-modules
targetNamespace: nexus-modules
# Post-build variable substitution
postBuild:
substitute:
module_version: "2.1.0"
module_replicas: "3"
sampling_rate: "1000"
alert_threshold: "75.0"
substituteFrom:
- kind: ConfigMap
name: module-config
- kind: Secret
name: module-secrets
# Health checks
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: temperature-monitor
namespace: nexus-modules
# Dependencies
dependsOn:
- name: nexus-core
namespace: flux-system
---
# HelmRelease for module with Flux
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
name: temperature-monitor
namespace: nexus-modules
spec:
interval: 10m
chart:
spec:
chart: nexus-module
version: "1.0.0"
sourceRef:
kind: HelmRepository
name: nexus-charts
namespace: flux-system
values:
module:
name: temperature-monitor
version: "2.1.0"
image:
repository: ghcr.io/myorg/temperature-monitor
tag: "2.1.0"
pullPolicy: IfNotPresent
replicas: 3
resources:
requests:
memory: "512Mi"
cpu: "500m"
limits:
memory: "1Gi"
cpu: "1000m"
config:
samplingRate: 1000
alertThreshold: 75.0
messageBufferSize: 1000
autoscaling:
enabled: true
minReplicas: 3
maxReplicas: 10
targetCPUUtilizationPercentage: 80
monitoring:
enabled: true
serviceMonitor:
enabled: true
interval: 30s
# Progressive delivery with Flagger
postRenderers:
- kustomize:
patches:
- target:
kind: Deployment
name: temperature-monitor
patch: |
apiVersion: apps/v1
kind: Deployment
metadata:
name: temperature-monitor
annotations:
flagger.app/config: |
analysis:
interval: 1m
threshold: 10
maxWeight: 50
stepWeight: 10
metrics:
- name: error-rate
templateRef:
name: error-rate
namespace: nexus-system
thresholdRange:
max: 1
interval: 1m
- name: latency
templateRef:
name: latency
namespace: nexus-system
thresholdRange:
max: 500
interval: 30s
# base/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: nexus-modules
resources:
- namespace.yaml
- deployment.yaml
- service.yaml
- configmap.yaml
- servicemonitor.yaml
commonLabels:
app.kubernetes.io/name: temperature-monitor
app.kubernetes.io/part-of: nexus-modules
app.kubernetes.io/managed-by: kustomize
configMapGenerator:
- name: temperature-monitor-config
files:
- config.yaml
options:
disableNameSuffixHash: true
secretGenerator:
- name: temperature-monitor-secrets
envs:
- secrets.env
type: Opaque
images:
- name: temperature-monitor
newName: ghcr.io/myorg/temperature-monitor
newTag: 2.1.0
replicas:
- name: temperature-monitor
count: 1
---
# base/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: temperature-monitor
spec:
selector:
matchLabels:
app.kubernetes.io/name: temperature-monitor
template:
metadata:
labels:
app.kubernetes.io/name: temperature-monitor
spec:
containers:
- name: module
image: temperature-monitor
ports:
- name: grpc
containerPort: 50051
- name: metrics
containerPort: 9090
env:
- name: NEXUS_MODULE_ID
value: temperature-monitor
- name: NEXUS_MODULE_VERSION
value: "2.1.0"
envFrom:
- configMapRef:
name: temperature-monitor-config
- secretRef:
name: temperature-monitor-secrets
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
livenessProbe:
grpc:
port: 50051
initialDelaySeconds: 10
readinessProbe:
grpc:
port: 50051
initialDelaySeconds: 5
---
# overlays/production/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: nexus-modules-production
resources:
- ../../base
replicas:
- name: temperature-monitor
count: 3
patches:
- path: deployment-patch.yaml
- path: resource-patch.yaml
- path: affinity-patch.yaml
patchesStrategicMerge:
- hpa.yaml
- pdb.yaml
configMapGenerator:
- name: temperature-monitor-config
behavior: merge
literals:
- SAMPLING_RATE=1000
- ALERT_THRESHOLD=75.0
- LOG_LEVEL=info
- METRICS_ENABLED=true
images:
- name: temperature-monitor
newTag: 2.1.0-stable
---
# overlays/production/deployment-patch.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: temperature-monitor
spec:
template:
spec:
containers:
- name: module
resources:
requests:
memory: "512Mi"
cpu: "500m"
limits:
memory: "1Gi"
cpu: "1000m"
env:
- name: ENVIRONMENT
value: production
- name: ENABLE_PROFILING
value: "false"
- name: MAX_CONNECTIONS
value: "1000"
---
# overlays/production/hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: temperature-monitor
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: temperature-monitor
minReplicas: 3
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 80
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
- type: Pods
pods:
metric:
name: message_queue_depth
target:
type: AverageValue
averageValue: "100"
behavior:
scaleDown:
stabilizationWindowSeconds: 300
policies:
- type: Percent
value: 10
periodSeconds: 60
scaleUp:
stabilizationWindowSeconds: 60
policies:
- type: Percent
value: 50
periodSeconds: 60
- type: Pods
value: 2
periodSeconds: 60
Security and Compliance
Container Security
- Image Signing: All module images are signed with cosign (verification example after this list)
- SBOM Generation: Software Bill of Materials included with each release
- Vulnerability Scanning: Automated scanning in CI/CD pipeline
- Policy Enforcement: OPA policies for deployment validation
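For example, a deployment gate can verify an image signature before rollout. A sketch using the cosign CLI; the public-key path and image name are assumptions based on the examples above.
# Verify the signature produced by the CI pipeline before deploying
cosign verify --key cosign.pub ghcr.io/myorg/temperature-monitor:2.1.0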
Monitoring and Observability
Track deployment metrics and module performance:
# Prometheus metrics exposed by deployment manager
nexus_deployment_total{module="temperature-monitor",environment="production",status="success"} 42
nexus_deployment_duration_seconds{module="temperature-monitor",quantile="0.99"} 120.5
nexus_deployment_rollback_total{module="temperature-monitor",reason="failed_analysis"} 2
nexus_canary_weight_percent{module="temperature-monitor",revision="abc123"} 25
# Grafana dashboard JSON available at:
# https://github.com/nexus-1/dashboards/blob/main/gitops-deployment.json
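These metrics can also drive alerting. A sketch of a Prometheus alerting rule on the rollback counter shown above; the group name, window, and threshold are illustrative.
groups:
  - name: nexus-deployments
    rules:
      - alert: ModuleRollbackOccurred
        expr: increase(nexus_deployment_rollback_total[1h]) > 0
        labels:
          severity: warning
        annotations:
          summary: "Module {{ $labels.module }} rolled back in the last hour"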
GitOps Best Practices
- Repository Structure: Separate app code from deployment configs
- Environment Promotion: Use PR-based promotion between environments
- Secret Management: Use Sealed Secrets or external secret operators
- Rollback Strategy: Maintain the last 10 known-good configurations (see the example after this list)
- Monitoring: Set up alerts for sync failures and deployment issues
- Documentation: Document deployment procedures and rollback plans
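For the rollback point above, a GitOps rollback is an ordinary Git operation: reverting the commit that introduced a bad configuration returns the cluster to the previous known-good state once the controller re-syncs. The commit hash below is a placeholder.
# Revert the offending configuration commit; the GitOps controller syncs the rollback
git revert <bad-config-commit-sha>
git push origin main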