Source code for shmpipeline.kernels.gpu.affine_transform
"""GPU affine transformation kernel."""
from __future__ import annotations
from typing import Any, Mapping
import torch
from shmpipeline.config import KernelConfig, SharedMemoryConfig
from shmpipeline.errors import ConfigValidationError
from shmpipeline.kernels.gpu.base import GpuKernel, as_gpu_tensor
[docs]
class AffineTransformGpuKernel(GpuKernel):
    """GPU kernel computing the affine map ``y = A x + b`` for a vector ``x``.

    ``x`` is the trigger input, ``A`` is read from the first auxiliary input
    and ``b`` from the second; the result is written into the output buffer.
    """

    kind = "gpu.affine_transform"
    # Two auxiliary buffers are consumed: the matrix first, then the offset.
    auxiliary_arity = 2

    @classmethod
    def validate_config(
        cls,
        config: KernelConfig,
        shared_memory: Mapping[str, SharedMemoryConfig],
    ) -> None:
        """Check that the configured buffers can express ``y = A x + b``.

        The base-class validation runs first. After that the checks are, in
        order: buffer ranks (1D input, 2D matrix, 1D offset, 1D output),
        dimension agreement between the buffers, and a single common dtype.

        Args:
            config: Kernel configuration naming the input, the two auxiliary
                buffers and the output.
            shared_memory: Buffer specifications keyed by buffer name.

        Raises:
            ConfigValidationError: On the first requirement that is violated.
        """
        super().validate_config(config, shared_memory)

        x_spec = shared_memory[config.input]
        a_spec = shared_memory[config.auxiliary_names[0]]
        b_spec = shared_memory[config.auxiliary_names[1]]
        y_spec = shared_memory[config.output]

        # Ranks are validated before any shape entry is indexed below.
        rank_requirements = (
            (x_spec, 1, f"kernel {config.name!r} requires a 1D input vector"),
            (a_spec, 2, f"kernel {config.name!r} requires a 2D transform matrix"),
            (b_spec, 1, f"kernel {config.name!r} requires a 1D offset vector"),
            (y_spec, 1, f"kernel {config.name!r} requires a 1D output vector"),
        )
        for spec, expected_rank, message in rank_requirements:
            if len(spec.shape) != expected_rank:
                raise ConfigValidationError(message)

        n_rows = a_spec.shape[0]
        n_cols = a_spec.shape[1]
        dimension_mismatches = (
            (
                n_cols != x_spec.shape[0],
                f"kernel {config.name!r} requires matrix columns to match input vector length",
            ),
            (
                n_rows != b_spec.shape[0],
                f"kernel {config.name!r} requires matrix rows to match offset vector length",
            ),
            (
                y_spec.shape[0] != n_rows,
                f"kernel {config.name!r} requires output vector length to match matrix rows",
            ),
        )
        for mismatched, message in dimension_mismatches:
            if mismatched:
                raise ConfigValidationError(message)

        # Collapsing the dtypes into a set leaves one element iff all agree.
        distinct_dtypes = {
            spec.dtype for spec in (x_spec, a_spec, b_spec, y_spec)
        }
        if len(distinct_dtypes) != 1:
            raise ConfigValidationError(
                f"kernel {config.name!r} requires matching dtypes across all affine inputs and outputs"
            )

    def compute_into(
        self,
        trigger_input: Any,
        output: Any,
        auxiliary_inputs: Mapping[str, Any],
    ) -> None:
        """Write ``A @ x + b`` into ``output`` in place.

        Args:
            trigger_input: The vector ``x``; converted with ``as_gpu_tensor``.
            output: Destination passed as ``out=`` to the torch calls.
                NOTE(review): assumed to already be a tensor on a CUDA
                device, since it is not converted here -- confirm in caller.
            auxiliary_inputs: Mapping from auxiliary alias to buffer; the
                first configured alias supplies ``A``, the second ``b``.
        """
        target_device = self.device
        x = as_gpu_tensor(trigger_input, device=target_device)

        aliases = self.context.config.auxiliary_aliases
        a = as_gpu_tensor(auxiliary_inputs[aliases[0]], device=target_device)
        b = as_gpu_tensor(auxiliary_inputs[aliases[1]], device=target_device)

        # Both steps target ``output`` directly, so no result tensor is
        # returned or rebound.
        torch.matmul(a, x, out=output)
        torch.add(output, b, out=output)

        # Block until queued work on the output's device has finished.
        torch.cuda.synchronize(output.device)