Process time series data and validate/infer timestamp parameters.
This function:
1. Creates a timestamp column if one doesn't exist
2. Validates the timestamp column exists and has no missing values
3. Sorts the data by timestamp
4. Infers timestamp_format from the data
5. Validates or infers timestamp_interval_seconds
6. Sets start_timestamp and stop_timestamp
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| training_df | DataFrame | The training DataFrame. | required |
| config | SafeSynthesizerParameters | The configuration object with time_series settings | required |
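Returns:
| Type | Description |
|---|---|
| tuple[DataFrame, SafeSynthesizerParameters] | Tuple of (processed DataFrame, updated config) |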
Raises:
| Type | Description |
|---|---|
| ParameterError | If the timestamp column is missing, if timestamp_format="elapsed_seconds" is set on a non-numeric column, or if an explicit format fails to parse the data. |
| DataError | If the timestamp column has missing values or intervals are inconsistent. |
Source code in src/nemo_safe_synthesizer/training/timeseries_preprocessing.py
def process_timeseries_data(
    training_df: pd.DataFrame,
    config: SafeSynthesizerParameters,
) -> tuple[pd.DataFrame, SafeSynthesizerParameters]:
    """Prepare a time-series training frame and fill in timestamp settings.

    The pipeline runs in this order:

    1. Add a pseudo-group column when one is needed.
    2. Create an elapsed-time column when no timestamp column was provided.
    3. Validate the timestamp column.
    4. Sort rows by group and timestamp.
    5. Infer the timestamp format and convert to datetime (skipped for
       elapsed-seconds data).
    6. Process the grouped timestamps and validate their consistency.
    7. Render datetime timestamps back to strings using the timestamp format
       (skipped for elapsed-seconds data).

    As a side effect, ``config.data.order_training_examples_by`` is set to the
    timestamp column.

    Args:
        training_df: The training DataFrame.
        config: The configuration object with time_series settings.

    Returns:
        Tuple of (processed DataFrame, config). The returned config is the
        object that was passed in.

    Raises:
        ParameterError: If the timestamp column is missing, if
            ``timestamp_format="elapsed_seconds"`` is set on a non-numeric
            column, or if an explicit format fails to parse the data.
        DataError: If the timestamp column has missing values or intervals
            are inconsistent.
        RuntimeError: If a helper leaves the group-by column or the timestamp
            column unset (internal invariant violation).
    """
    series_settings = config.time_series

    # Step 1: make sure there is a column to group by.
    frame, group_col = _add_pseudo_group_if_needed(training_df, config)
    if group_col is None:
        raise RuntimeError("group_by_col should have been set by _add_pseudo_group_if_needed")

    # Step 2: build an elapsed-time column when no timestamp was supplied.
    frame, elapsed = _create_elapsed_time_column(frame, series_settings, group_col)
    timestamp_col = series_settings.timestamp_column
    if timestamp_col is None:
        raise RuntimeError("timestamp_column should have been set by _create_elapsed_time_column")
    config.data.order_training_examples_by = timestamp_col

    # Step 3: validate first, ahead of any dtype inspection, so that a missing
    # column surfaces as a ParameterError with guidance instead of a KeyError.
    _validate_timestamp_column(frame, timestamp_col)
    elapsed = elapsed or _detect_elapsed_seconds_format(frame, series_settings)

    # Step 4: order rows by group, then by timestamp.
    frame = _sort_by_group_and_timestamp(frame, group_col, series_settings.timestamp_column)

    # Step 5: datetime conversion only applies to real timestamps; elapsed
    # seconds (generated here or supplied by the user) stay numeric.
    if not (elapsed or series_settings.timestamp_format == "elapsed_seconds"):
        frame = _infer_and_convert_timestamp_format(frame, series_settings)

    # Step 6: per-group processing and consistency validation.
    series_settings = _process_grouped_timestamps(frame, series_settings, group_col, elapsed)

    # Step 7: write timestamps back as strings, except for elapsed-seconds
    # data, whose values are already numeric.
    if elapsed:
        return frame, config
    output_format = series_settings.timestamp_format
    if output_format is None or output_format == "elapsed_seconds":
        return frame, config

    column_name = series_settings.timestamp_column
    frame[column_name] = frame[column_name].dt.strftime(output_format)
    return frame, config
|