API Reference

Every public class and method in Neon, documented.


Model

NeonVLA

The complete Vision-Language-Action model. The pipeline from pixels to joint commands.

from neon.model.neon_vla import NeonVLA, NeonConfig

model = NeonVLA(config)
model.load_backbone()

Constructor: NeonVLA(config: NeonConfig)

Method | Args | Returns | What It Does
load_backbone() | – | – | Download and load the video backbone
forward(images, video_frames, text, proprioception, audio) | See below | (batch, steps, action_dim) | Full forward pass
predict(image, video_frames, instruction, proprioception, audio, speak) | See below | NeonOutput | Inference with post-processing
compute_loss(images, video_frames, text, proprioception, target_actions, action_mask, audio) | See below | Dict[str, Tensor] | Training loss computation
save_pretrained(path) | str | – | Save config + trainable weights (~25 MB)
from_pretrained(path, load_backbone) | str, bool | NeonVLA | Load a saved model
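
A save/load round trip might look like this (checkpoint path illustrative):

model.save_pretrained("checkpoints/neon-g1")  # writes config + ~25 MB of trainable weights
restored = NeonVLA.from_pretrained("checkpoints/neon-g1", load_backbone=True)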

NeonConfig

@dataclass
class NeonConfig:
    backbone: BackboneConfig          # Video backbone configuration
    control_mode: str = "arms_only"   # "arms_only" | "upper_body" | "whole_body"
    include_locomotion: bool = True   # Add 3 DoF velocity output
    num_action_steps: int = 16        # Action chunk size
    mlp_hidden: int = 512             # Action head MLP width
    action_head_layers: int = 3       # Action head depth
    action_head_dropout: float = 0.1
    use_separate_heads: bool = True   # Per-group decoders
    use_action_chunking: bool = True  # Temporal prediction
    use_proprioception: bool = True   # Include joint state encoder
    proprioception_hidden: int = 128
    learning_rate: float = 2e-4
    weight_decay: float = 0.01
    warmup_ratio: float = 0.1
    audio: Optional[Dict] = None      # AudioConfig as dict, or None
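
A minimal construction, reusing the backbone config shown in the VideoBackbone example below (all other fields keep their defaults):

from neon.model.neon_vla import NeonVLA, NeonConfig
from neon.model.video_backbone import BackboneConfig

config = NeonConfig(
    backbone=BackboneConfig(model_id="Qwen/Qwen2.5-VL-3B-Instruct"),
    control_mode="arms_only",
)
model = NeonVLA(config)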

NeonOutput

@dataclass
class NeonOutput:
    actions: np.ndarray           # (steps, action_dim) normalized [-1, 1]
    raw_actions: np.ndarray       # (steps, action_dim) in physical units
    upper_body: np.ndarray        # (steps, 14) arm joint positions
    locomotion: np.ndarray        # (steps, 3) vx, vy, ω
    gripper: np.ndarray           # (steps, 2) gripper positions
    speech_path: Optional[str]    # Path to generated speech WAV
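
Typical use after predict (frame and instruction illustrative):

output = model.predict(image=frame, instruction="Pick up the cup")
arm_targets = output.upper_body[0]  # 14 arm joint positions for the first step
base_cmd = output.locomotion[0]     # vx, vy, ω for the first step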

Video Backbone

VideoBackbone

from neon.model.video_backbone import VideoBackbone, BackboneConfig

backbone = VideoBackbone(BackboneConfig(model_id="Qwen/Qwen2.5-VL-3B-Instruct"))
backbone.load()
result = backbone.encode(images=[img], text="Pick up the cup")

Method | Returns | What It Does
load() | – | Load model weights from HuggingFace
encode(images, video_frames, text) | Dict[hidden_states, pooled] | Encode inputs to features
forward(images, video_frames, text) | Tensor | Returns pooled features

Properties: hidden_size: int — Output dimension (auto-detected from model).


Action Heads

G1ActionHead

Orchestrates per-group action decoders for the G1 humanoid.

from neon.model.action_heads import G1ActionHead, ActionHeadConfig

head = G1ActionHead(config, action_space)
actions = head(features)  # (batch, 16, 17)
loss = head.compute_loss(predicted, target, mask)

ActionChunkingHead

Predicts temporal action sequences with learnable step embeddings.

from neon.model.action_heads import ActionChunkingHead

head = ActionChunkingHead(input_dim=2048, action_dim=14, num_steps=16)
actions = head(features)  # (batch, 16, 14)

MLPDecoder

Parameter Golf v2 MLP with ReLU², RMSNorm, skip connections, and soft-capping.

from neon.model.action_heads import MLPDecoder

decoder = MLPDecoder(input_dim=2048, output_dim=3, hidden_dim=256)
actions = decoder(features)  # (batch, 3)

FlowMatchingHead

Flow-matching head for multi-modal action distributions. Derived from π₀ and rectified flow.

from neon.model.action_heads import FlowMatchingHead

head = FlowMatchingHead(
    input_dim=2048, action_dim=17, num_steps=16,
    hidden_dim=512, num_layers=4, num_denoise_steps=4,
    soft_cap_value=1.0,
    noise_beta_alpha=0.0,         # Optional Beta noise scheduling
    use_cross_attention=False,    # Optional cross-attention to backbone
)

# Training
loss = head.compute_loss(features, target_actions, mask)

# Inference (Euler integration)
actions = head.sample(features, num_denoise_steps=4)  # (batch, 16, 17)

Method | Args | Returns | What It Does
forward(features) | (batch, input_dim) | (batch, steps, action_dim) | Forward pass (training mode)
compute_loss(features, targets, mask) | Tensors | Dict[str, Tensor] | Flow-matching loss
sample(features, num_denoise_steps) | Tensor, int | (batch, steps, action_dim) | Euler integration from noise

DiTActionHead

Diffusion Transformer with adaLN conditioning, inspired by GR00T N1.6.

from neon.model.action_heads import DiTActionHead

head = DiTActionHead(
    input_dim=2048, action_dim=17, num_steps=16,
    hidden_dim=384, num_layers=8, num_heads=6,
    num_denoise_steps=4, soft_cap_value=1.0,
)

loss = head.compute_loss(features, target_actions, mask)
actions = head.sample(features, num_denoise_steps=4)

DiTBlock

Single Diffusion Transformer block with adaLN, self-attention, optional cross-attention, and MLP.

from neon.model.action_heads import DiTBlock

block = DiTBlock(hidden_dim=384, num_heads=6, dropout=0.1, cross_attention_dim=2048)

StateRelativeHead

Wrapper that predicts the action as a delta (Δ) from the current joint state. Adds zero extra parameters.

from neon.model.action_heads import StateRelativeHead, ActionChunkingHead

inner = ActionChunkingHead(input_dim=2048, action_dim=17, num_steps=16)
head = StateRelativeHead(inner_head=inner, action_dim=17, num_steps=16)
actions = head(features, current_state=joint_positions)

EnsembleHead

Gated ensemble of MLP + FlowMatching + DiT heads. ~13M params total.

from neon.model.action_heads import EnsembleHead

head = EnsembleHead(
    input_dim=2048, action_dim=17, num_steps=16,
    mlp_hidden=512, flow_hidden=512, dit_hidden=384,
    dit_layers=8, dit_heads=6, flow_layers=4,
    num_denoise_steps=4,
)
actions = head(features)  # Gated weighted sum of three heads

Audio

AudioEncoder

from neon.model.audio import AudioEncoder, AudioConfig

encoder = AudioEncoder(AudioConfig(encoder_type="whisper"), backbone_hidden_size=2048)
encoder.load()
features = encoder.encode(audio_tensor)  # (batch, 2048)

SpeechResponseHead

from neon.model.audio import SpeechResponseHead

head = SpeechResponseHead(hidden_size=2048)
output = head(features)
# output["response_type"]: (batch, 5) — narrate/confirm/warn/ask/silent
# output["token_logits"]: (batch, vocab_size)

PersonaPlexSpeaker

from neon.model.audio import PersonaPlexSpeaker, AudioConfig

speaker = PersonaPlexSpeaker(AudioConfig(personaplex_voice="NATM1"))
path = speaker.speak("Reaching for the cup")  # → "/tmp/neon_speech_xxx.wav"

Action Space

G1ActionSpace

from neon.data.action_space import G1ActionSpace, ControlMode

space = G1ActionSpace(mode=ControlMode.ARMS_ONLY, include_locomotion=True)

Property | Type | What It Is
action_dim | int | Total output dimensions
num_joints | int | Number of controlled joints
active_joints | List[JointInfo] | Joints in current mode
joint_names | List[str] | Active joint names

Method | Args | Returns | What It Does
normalize(actions) | ndarray | ndarray | Raw → [-1, 1]
denormalize(actions) | ndarray | ndarray | [-1, 1] → raw
split_action(action) | ndarray | Dict[str, ndarray] | Split by joint group
default_position() | – | ndarray | Standing pose
get_group_indices(group) | str | List[int] | Indices for a group
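
For example (the group name passed to get_group_indices is an assumption):

raw = space.default_position()                  # standing pose, physical units
norm = space.normalize(raw)                     # mapped into [-1, 1]
groups = space.split_action(norm)               # dict keyed by joint group
left_arm = space.get_group_indices("left_arm")  # assumed group name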

Data

DataSoupDataset

from neon.data.data_soup import DataSoupDataset, DataSoupConfig

dataset = DataSoupDataset(config, action_space)
sample = dataset[0]
# sample["image"], sample["target_actions"], sample["language"], ...

Supported source types: lerobot, agibot, dreamgen, cosmos_dreamgen, oxe, voice_commands, stereo4d, g1_teleop, groot_teleop, neon_v3, neon_native, bones_seed, kimodo, molmobot

ActionMapper

from neon.data.data_soup import ActionMapper

mapper = ActionMapper(target_space=action_space)
g1_actions = mapper.map_actions(franka_actions, "franka")

Inference

NeonInferenceServer

from neon.inference.server import NeonInferenceServer

server = NeonInferenceServer("cagataydev/neon-g1-v1")
output = server.predict(image=frame, instruction="Pick up the cup")
server.reset()  # Clear smoothing state

G1Controller

from neon.inference.g1_controller import G1Controller, G1Config

controller = G1Controller(G1Config(robot_ip="192.168.123.10"))
controller.connect()
controller.run_control_loop(model, instruction="Pick up the cup", max_steps=200)
controller.disconnect()

Training

NeonTrainer

from neon.training.train import NeonTrainer
from neon.training.config import TrainConfig

trainer = NeonTrainer(TrainConfig())
stats = trainer.train()

Presets

The ready-made training presets are listed under Training Presets in Training Utilities below.

Policy

NeonPolicy

strands-robots compatible VLA policy with RTC action queue and omni-modal support.

from neon import NeonPolicy

policy = NeonPolicy(
    host="192.168.123.10",
    port=8300,
    protocol="http",           # or "zmq"
    blend_schedule="linear",   # "linear", "step", "exponential"
    chunk_size=16,
    action_dim=17,
    speak=False,               # Enable PersonaPlex speech output
)

# Set robot joint keys for action dict construction
policy.set_robot_state_keys(["left_hip_pitch", "left_hip_roll", ...])

# Async usage
actions = await policy.get_actions(observation_dict, "pick up the cup")

# Sync usage
actions = policy.get_actions_sync(observation_dict, "pick up the cup")

# Health check — discover supported modalities
health = await policy.health_check()
print(health["modalities"])
# {"camera": true, "audio": true, "lidar": false, ...}

Observation keys (automatically mapped to server fields):

Key Pattern | Server Field | Type
observation.images.* | image_base64 / video_frames_base64 | (H, W, 3) uint8
observation.state | proprioception | (N,) float32
observation.audio | audio | (N,) float32, 16 kHz
observation.lidar | lidar | (N, 4) float32
observation.eef_state | eef_state | (14,) float32
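
A sketch of a conforming observation dict (camera key name and shapes are assumptions):

import numpy as np

obs = {
    "observation.images.head": frame,                        # (H, W, 3) uint8
    "observation.state": np.zeros(17, dtype=np.float32),     # proprioception
    "observation.audio": np.zeros(16000, dtype=np.float32),  # one second at 16 kHz
}
actions = policy.get_actions_sync(obs, "pick up the cup")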

Encoders

PointCloudEncoder

PointNet-style encoder for LiDAR point clouds. Shared MLPs + max-pool.

from neon.model.neon_vla import PointCloudEncoder

encoder = PointCloudEncoder(input_dim=4, hidden_dim=128, output_dim=256)
features = encoder(points)  # (batch, N, 4) → (batch, 256)

EEFEncoder

MLP encoder for bimanual end-effector state (position + quaternion).

from neon.model.neon_vla import EEFEncoder

encoder = EEFEncoder(input_dim=14, hidden_dim=64, output_dim=128)
features = encoder(eef_state)  # (batch, 14) → (batch, 128)

ProprioceptionEncoder

MLP encoder for current joint states.

from neon.model.neon_vla import ProprioceptionEncoder

encoder = ProprioceptionEncoder(input_dim=17, hidden_dim=128, output_dim=256)
features = encoder(joint_states)  # (batch, 17) → (batch, 256)

Synthetic Data Pipeline

IDMExtractor

Inverse Dynamics Model (IDM) action extraction from video. Uses NVIDIA's GR00T-Dreams IDM (seonghyeonye/IDM_gr1) to recover plausible joint actions from Cosmos-generated videos.

from neon.synth.idm_extractor import IDMExtractor, IDMConfig

extractor = IDMExtractor(IDMConfig(
    checkpoint="seonghyeonye/IDM_gr1",
    embodiment="gr1",           # gr1=44-DoF, franka=7-DoF, so100=6-DoF
    action_dim=44,
))

# Extract actions from video frames
actions = extractor.extract(video_frames)  # (T-1, 44) joint actions between frames

CosmosAugmentor

Cosmos-Predict2.5 video augmentation — generates photorealistic variants of demonstration videos.

from neon.synth.cosmos_augmentor import CosmosAugmentor

augmentor = CosmosAugmentor(resolution=256, num_steps=4)
augmented_video = augmentor.augment(input_video, prompt="robot picking up cup in kitchen")

KimodoGenerator

Procedural kinematic motion generation for bootstrapping training data.

from neon.synth.kimodo_generator import KimodoGenerator

generator = KimodoGenerator()
trajectories = generator.generate(task="reach_and_grasp", num_episodes=1000)

WorldGenerator

3D world generation for sim environments via the Marble Pipeline.

from neon.synth.world_generator import WorldGenerator

generator = WorldGenerator()
scene = generator.generate(prompt="kitchen counter with cups and plates")

Photon Inference Engine

SpeculativeActionDecoder

Drafts cheap actions with a small MLP, then verifies/refines them with the full Flow/DiT head. Saves ~40% of inference time on simple motions.

from neon.model.photon import SpeculativeActionDecoder

spec_decoder = SpeculativeActionDecoder(
    draft_head=mlp_head,
    verify_head=flow_head,
    acceptance_threshold=0.15,  # Accept draft if error < threshold
)
actions = spec_decoder(features)

AdaptiveComputeRouter

Routes simple actions (stationary, slow reaches) to the fast MLP path and complex actions (dynamic grasps) to the Flow/DiT heads.

from neon.model.photon import AdaptiveComputeRouter, ComputeRoute

router = AdaptiveComputeRouter(
    input_dim=2048,
    routes=[
        ComputeRoute("fast", mlp_head, compute_cost=1.0),
        ComputeRoute("medium", flow_head, compute_cost=4.0),
        ComputeRoute("heavy", dit_head, compute_cost=8.0),
    ],
)
actions = router(features)  # Automatically selects optimal route

PhotonInferenceEngine

Full Photon engine combining adaptive routing + speculative decoding + torch.compile.

from neon.model.photon import PhotonInferenceEngine

engine = PhotonInferenceEngine(model, device="cuda")
actions = engine.predict(image, instruction, proprio)

Physics-Aware Guidance (AirVLA)

Inject gradient corrections into flow-matching sampling at inference time, steering toward physically feasible actions without modifying model weights.

PayloadAwareGuidance

Compensate for payload effects (e.g., altitude sag when gripping).

from neon.model.guidance import PayloadAwareGuidance

guidance = PayloadAwareGuidance(
    altitude_index=2,
    gripper_index=4,
    target_altitude_offset=0.05,
)
actions = flow_head.sample(features, guidance_fn=guidance, guidance_scale=2.0)

JointTorqueSafetyGuidance

Enforce joint torque limits during generation.

from neon.model.guidance import JointTorqueSafetyGuidance
guidance = JointTorqueSafetyGuidance(action_space=g1_action_space)

CollisionAvoidanceGuidance

Steer away from self-collisions or known obstacles.

from neon.model.guidance import CollisionAvoidanceGuidance
guidance = CollisionAvoidanceGuidance(obstacle_positions=[(0.3, 0.1, 0.5)])

SmoothTrajectoryGuidance

Penalize jerk (third derivative) for smooth robot motion.

from neon.model.guidance import SmoothTrajectoryGuidance
guidance = SmoothTrajectoryGuidance(jerk_weight=0.1)

CompositeGuidance

Compose multiple guidance functions with individual scales.

from neon.model.guidance import CompositeGuidance

guidance = CompositeGuidance([
    (PayloadAwareGuidance(...), 2.0),
    (CollisionAvoidanceGuidance(...), 1.0),
    (SmoothTrajectoryGuidance(), 0.5),
])
actions = flow_head.sample(features, guidance_fn=guidance, guidance_scale=1.0)

Additional Sensor Encoders

GPSEncoder

from neon.model.neon_vla import GPSEncoder
encoder = GPSEncoder(input_dim=6, hidden_dim=64, output_dim=128)
features = encoder(gps_data)  # (batch, 6) → (batch, 128)

DepthEncoder

from neon.model.neon_vla import DepthEncoder
encoder = DepthEncoder(output_dim=256)
features = encoder(depth_map)  # (batch, 1, H, W) → (batch, 256)

SegmentationEncoder

from neon.model.neon_vla import SegmentationEncoder
encoder = SegmentationEncoder(num_classes=20, output_dim=256)
features = encoder(seg_map)  # (batch, H, W) → (batch, 256)

TactileEncoder

from neon.model.neon_vla import TactileEncoder
encoder = TactileEncoder(num_links=14, force_dim=6, output_dim=128)
features = encoder(tactile_data)  # (batch, 14, 6) → (batch, 128)

IMUEncoder

from neon.model.neon_vla import IMUEncoder
encoder = IMUEncoder(input_dim=9, hidden_dim=64, output_dim=128)
features = encoder(imu_data)  # (batch, 9) → (batch, 128)

ForceEncoder

from neon.model.neon_vla import ForceEncoder
encoder = ForceEncoder(input_dim=6, hidden_dim=64, output_dim=128)
features = encoder(force_data)  # (batch, 6) → (batch, 128)

Evaluation Suites

LIBEROEvaluator

LIBERO 4-suite benchmark (Spatial, Object, Goal, Long). Results are compared against GR00T N1.6 (87.5%) and π₀.₅ (SOTA).

from neon.eval.libero_eval import LIBEROEvaluator, LIBEROConfig

evaluator = LIBEROEvaluator(LIBEROConfig(
    model_path="cagataydev/neon-g1-v1",
    suites=["spatial", "object", "goal", "long"],
    num_rollouts=20,
))
results = evaluator.evaluate()
print(results.summary())

RoboCasaEvaluator

RoboCasa zero-shot kitchen manipulation evaluation.

from neon.eval.robocasa_eval import RoboCasaEvaluator, RoboCasaConfig

evaluator = RoboCasaEvaluator(RoboCasaConfig(model_path="cagataydev/neon-g1-v1"))
results = evaluator.evaluate()

SimplerEnvEvaluator

SimplerEnv suite (Google RT-2 benchmark environments).

from neon.eval.simplerenv_eval import SimplerEnvEvaluator, SimplerEnvConfig

evaluator = SimplerEnvEvaluator(SimplerEnvConfig(model_path="cagataydev/neon-g1-v1"))
results = evaluator.evaluate()

Simulation

Simulation

Self-contained MuJoCo simulation class supporting URDF/MJCF loading, multi-robot scenes, domain randomization, and trajectory recording.

from neon.sim.simulation import Simulation

sim = Simulation()
sim.create_world()
sim.add_robot(urdf_path="g1.urdf", name="g1", position=[0, 0, 0])
sim.add_object(shape="box", name="cube", size=[0.05, 0.05, 0.05], position=[0.3, 0, 0.55])
obs = sim.get_observation("g1")
sim.step(actions)
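
A minimal closed loop against a trained model (glue code is a sketch; the "image" observation key is an assumption):

for _ in range(200):
    obs = sim.get_observation("g1")
    out = model.predict(image=obs["image"], instruction="pick up the cube")  # key assumed
    sim.step(out.raw_actions[0])  # apply the first step of the predicted chunk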

NewtonBackend

Newton GPU-accelerated backend — 4096+ parallel environments on a single GPU with differentiable simulation.

from neon.sim.newton.newton_backend import NewtonBackend, NewtonConfig

config = NewtonConfig(num_envs=4096, solver="featherstone", device="cuda:0")
backend = NewtonBackend(config)
backend.create_world()
backend.add_robot("g1", urdf_path="g1.urdf")
backend.replicate(num_envs=4096)

Morphology (GulaMannen)

HumanoidMorphology

Modular humanoid morphology system with 6 swappable configurations for sim-first design exploration.

from neon.sim.morphology import HumanoidMorphology, MorphologyConfig

morph = HumanoidMorphology(MorphologyConfig(
    variant="series_biped",  # or "parallel_joints", "pedestal_hybrid", etc.
))

Configurations: series_biped (30 DoF), parallel_joints, pedestal_hybrid, vertical_actuator, single_leg, linear_rail_tower.


Training Utilities

SelfLearner

Online self-supervised adaptation engine — improves action heads during deployment without human labels.

from neon.training.self_learner import SelfLearner

learner = SelfLearner(model, config)
learner.step(observation_before, actions, observation_after)
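
In deployment this becomes a before/act/after loop; a sketch, with observation access assumed:

obs_before = sim.get_observation("g1")
out = model.predict(image=obs_before["image"], instruction="stack the cups")  # key assumed
sim.step(out.raw_actions[0])
obs_after = sim.get_observation("g1")
learner.step(obs_before, out.raw_actions, obs_after)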

Training Presets

from neon.training.config import (
    # Standard
    default_arms_only_config,    # 7M, Omni-7B, arms_only
    default_wholebody_config,    # 10M, Omni-7B, whole_body
    cosmos_physics_config,       # 8M, Cosmos-8B, arms_only
    edge_3b_config,              # 2M, Omni-3B, arms_only (Jetson)
    # Large
    large_arms_config,           # 44M, Omni-7B, arms_only (A100)
    large_cosmos_config,         # 44M, Cosmos-8B, arms_only (A100)
    large_wholebody_config,      # 55M, Omni-7B, whole_body (A100 80GB)
    # Advanced heads
    flow_arms_only_config,       # FlowMatching head
    dit_arms_only_config,        # DiT head
    flow_wholebody_config,       # Flow + whole body
    # Omni-modal
    g1_omnimodal_config,         # All sensors + whole body
    synth_omnimodal_config,      # Synthetic data pipeline
    omnimodal_9_config,          # 9 modalities
    # Named presets
    neon_nano_config,            # Cheapest baseline
    neon_base_flow_config,       # Standard quality (flow)
    neon_dit_config,             # DiT 8-layer head
    neon_cosmos_flow_config,     # Physics + flow
    neon_ensemble_config,        # Gated MLP+Flow+DiT
    neon_omni_12_config,         # ALL 14 modalities
    neon_wholebody_flow_config,  # 29DoF + BONES-SEED
)