def __init__(self, config):
    """Initialize training and visualization settings from a parsed YAML config.

    Args:
        config (dict): Parsed YAML configuration containing 'TRAINING' and
            'VISUALIZATION' sections.
    """
    # Filled in later by the activation/visualization steps
    self.activation_model = None
    self.layer_outputs = None
    # READING FROM A YAML CONFIG FILE
    training = config['TRAINING']
    visualization = config['VISUALIZATION']
    self.info = training['info']
    self.noEpochs = training['noEpochs']
    self.batch_size = training["batch_size"]
    self.nNeurons = training["nNeurons"]
    self.validation_split = training["validation_split"]
    # Location to save the trained model
    self.h5_file_loc = training["h5file_loc"]
    # Location to save the Dimensionality Reduction results
    self.pacmap_file_loc = visualization["pacmap_file_loc"]
    self.pcache_file_loc = visualization["pcache_file_loc"]
    self.image_file_loc = visualization["image_file_loc"]
def load_data(features_csv: str, labels_csv: str, key_column: str, win_size: int) -> tuple[ndarray[Any, Any], Any]:
    """
    Loads features and labels from separate CSV files, merges them on a common key, reshapes the features
    into sequences, and one-hot encodes the labels. The function also saves the label encoding to a YAML file.

    Args:
        features_csv (str): Path to the CSV file containing the feature data.
        labels_csv (str): Path to the CSV file containing the label data.
        key_column (str): The column name used as the key for merging the features and labels.
        win_size (int): The window size for reshaping the data into sequences of time steps.

    Returns:
        Tuple:
            - X_reshaped: A 3D array of reshaped feature data with dimensions
              [number_of_sequences, win_size, number_of_features].
            - y_one_hot: A 2D array of one-hot encoded labels with dimensions
              [number_of_sequences, number_of_classes].

    Raises:
        ValueError: If there is an issue with the merging or reshaping of data.

    Steps:
        1. Loads the features and labels from the provided CSV files.
        2. Merges the two datasets based on the specified key column.
        3. Reshapes the features into sequences of `win_size` time steps.
        4. Extracts labels, encodes them using a label encoder, and one-hot encodes the labels.
        5. Saves the mapping of original label names to their encoded form in a `model_labels.yaml` file.
    """
    # Load features and labels
    features_df = pd.read_csv(features_csv)
    labels_df = pd.read_csv(labels_csv)
    # Merge features and labels on the key column
    combined_df = pd.merge(features_df, labels_df, on=key_column)
    # Drop the two trailing non-feature columns before building X.
    # NOTE(review): reconstructed from a truncated line — confirm exactly which
    # columns the original dropped. Adjust 'Labels' to your actual label column name.
    X = combined_df.drop(columns=[combined_df.columns[len(combined_df.columns) - 2],
                                  combined_df.columns[len(combined_df.columns) - 1]]).values
    # Reshape X to have sequences of [win_size] timesteps: [number_of_sequences, win_size, number_of_features]
    number_of_features = X.shape[1]
    number_of_sequences = X.shape[0] // win_size
    print("Number of features: ", number_of_features)
    print("number of sequences: ", number_of_sequences)
    # Trim any trailing rows that do not fill a whole window so the reshape cannot fail
    X = X[:number_of_sequences * win_size]
    X_reshaped = X.reshape((number_of_sequences, win_size, number_of_features))
    print("Shape of X:", X_reshaped.shape)  # Debugging line to check the shape of X
    # Extract labels, taking one label for every win_size timesteps
    y = combined_df['Labels'].values[::win_size][:number_of_sequences]  # Adjust 'Labels' to your label column name
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    y_one_hot = to_categorical(y_encoded)
    print("Shape of Y:", y_one_hot.shape)
    # To get the list of original labels in the order used during encoding:
    original_labels_order = label_encoder.classes_
    # Create the labels data structure for YAML, nested under MODEL->labels to
    # match what visualize_pacmap's load_labels_from_yaml reads back.
    labels_dict = {
        'MODEL': {
            'labels': {f'{i}': label for i, label in enumerate(original_labels_order)}
        }
    }
    with open('model_labels.yaml', 'w') as file:
        yaml.dump(labels_dict, file, sort_keys=False, default_flow_style=False)
    return X_reshaped, y_one_hot
def split_data(X, y, validation_split: float = 0.2):
    """Split sequences into train/validation sets without shuffling.

    Keeps one validation sample every N samples (N = 1 / validation_split),
    preserving the temporal order of the sequences.

    Args:
        X: Feature array of shape [number_of_sequences, ...].
        y: Label array aligned with X along the first axis.
        validation_split (float): Fraction of samples reserved for validation.

    Returns:
        Tuple of (X_train, X_val, y_train, y_val) as numpy arrays.
    """
    N = int(1 / validation_split)  # Define the interval (1 validation sample for every N training samples)
    number_of_sequences = X.shape[0]
    X_train, X_val, y_train, y_val = [], [], [], []
    # Loop over all the sequences; every N-th sample goes to validation.
    # NOTE(review): the original branch was truncated — it sent one sample in
    # every N to validation; reconstructed as i % N == 0, confirm the phase.
    for i in range(number_of_sequences):
        if i % N == 0:
            X_val.append(X[i])    # Validation data
            y_val.append(y[i])
        else:
            X_train.append(X[i])  # Training data
            y_train.append(y[i])
    X_train = np.array(X_train)
    X_val = np.array(X_val)
    y_train = np.array(y_train)
    y_val = np.array(y_val)
    print("Training data shape:", X_train.shape)
    print("Validation data shape:", X_val.shape)
    print("Training labels shape:", y_train.shape)
    print("Validation labels shape:", y_val.shape)
    return X_train, X_val, y_train, y_val
def create_model(self, input_shape, num_classes):
    """
    Creates and compiles the LSTM model.

    Args:
        input_shape (tuple): Shape of the input data (time steps, features).
        num_classes (int): Number of classes in the dataset.
    """
    # A Sequential model in Keras is a linear stack of layers
    self.model = Sequential()
    # The requirements to use the cuDNN implementation are:
    # ------------------------------------------------------
    # https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM
    # recurrent_activation == sigmoid, recurrent_dropout == 0
    # An LSTM layer is added to the model as the first layer.
    # NOTE(review): the first add(...) call was truncated in the original chunk;
    # reconstructed with return_sequences=True (required by the TimeDistributed
    # wrapper that follows) — confirm against the original file.
    self.model.add(LSTM(self.nNeurons,
                        return_sequences=True,
                        input_shape=input_shape,
                        kernel_regularizer=l2(0.01),
                        recurrent_regularizer=l2(0.1),
                        recurrent_dropout=0.0))  # needs to be 0 for cuDNN
    # Normalize activations at every timestep between the two LSTM layers
    self.model.add(TimeDistributed(BatchNormalization()))
    # Another LSTM layer is added to the model
    self.model.add(LSTM(self.nNeurons,
                        kernel_regularizer=l2(0.01),
                        recurrent_regularizer=l2(0.1),
                        recurrent_dropout=0.0))  # needs to be 0 for cuDNN
    self.model.add(BatchNormalization())
    # Softmax output layer: one probability per class
    self.model.add(Dense(num_classes, activation='softmax'))
    # Compile the model with Nadam optimizer and a learning rate scheduler
    optimizer = Nadam(learning_rate=0.0006)
    # NOTE(review): categorical_crossentropy_cpu is a custom loss defined
    # elsewhere in the project — not visible in this chunk.
    self.model.compile(optimizer=optimizer, loss=categorical_crossentropy_cpu,
                       metrics=['accuracy'])
    # prints a summary representation of the model, showing the layout of the layers,
    # the shape of the output from each layer, and the number of parameters (weights and biases) in each layer
    self.model.summary()
def train_model(self, X, y):
    """
    Trains the compiled model on X/y using a deterministic, order-preserving
    train/validation split, then plots the training history and saves the
    model to self.h5_file_loc.

    Args:
        X (numpy array): Feature data.
        y (numpy array): One-hot encoded labels.
    """
    # This out-of-the box method is not good because it shuffles our data
    #X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.validation_split, random_state=42)
    # We wrote a custom method that keeps one validation sample every N samples
    X_train, X_val, y_train, y_val = LSTMNeuralNet.split_data(X, y, self.validation_split)
    # NOTE(review): `early_stopping` and `lr_scheduler` are used below but their
    # definitions are not visible in this chunk — presumably callback objects
    # (e.g. early-stopping / LR-schedule) created on truncated lines; confirm.
    # Likewise `epochs=self.noEpochs` appears to be missing from fit(...).
    history = self.model.fit(X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=self.batch_size,
    callbacks=[early_stopping, lr_scheduler])
    # Save the loss/accuracy curves next to the model file (.h5 -> .jpg)
    plot_training_history(history, self.h5_file_loc.replace(".h5", ".jpg"))
    # Persist the trained network; run_pacmap_from_file loads it back later
    self.model.save(self.h5_file_loc)
    print(f"Model saved at {self.h5_file_loc}")
def run_pacmap(trained_model, X):
    """Reduce the trained model's latent activations to 3-D with PaCMAP.

    Args:
        trained_model: A compiled/trained Keras model.
        X: Feature data to run through the model.

    Returns:
        Tuple of (embedding, reduced_activations) where embedding is the
        fitted PaCMAP instance and reduced_activations is an (n_samples, 3)
        array of embedded points.
    """
    # Step 2: Prepare the activation_model
    # Assuming you want the activations from the last LSTM layer
    lstm_layer_output = trained_model.layers[-2].output  # Adjust index as needed
    activation_model = Model(inputs=trained_model.input, outputs=lstm_layer_output)
    # Step 4: Visualize Latent Space
    activations = activation_model.predict(X)
    # Initializing the pacmap instance.
    # Setting n_neighbors to "None" leads to an automatic choice (see PaCMAP docs)
    embedding = pacmap.PaCMAP(n_components=3, n_neighbors=None, MN_ratio=0.5,
                              FP_ratio=2.0, apply_pca=True)
    # Flatten per-sample activations to 2-D before fitting
    reduced_activations = embedding.fit_transform(activations.reshape(activations.shape[0], -1), init="random")
    return embedding, reduced_activations
def run_pacmap_from_file(features_csv, labels_csv, key_column, model_loc, win_size):
    """Load a previously trained model from disk and compute its PaCMAP embedding.

    Args:
        features_csv: Path to the feature CSV file.
        labels_csv: Path to the label CSV file.
        key_column: Column name used to merge features and labels.
        model_loc: Path to the saved .h5 model file.
        win_size: Window size used to reshape the data into sequences.

    Returns:
        Tuple of (embedding, reduced_activations) from run_pacmap.
    """
    model = load_model(model_loc)
    features, _labels = LSTMNeuralNet.load_data(features_csv, labels_csv, key_column, win_size)
    return LSTMNeuralNet.run_pacmap(model, features)
def visualize_pacmap(info: str, reduced_activations: np.ndarray, class_indices: Union[np.ndarray, List[int]],
                     output_file_path: str) -> None:
    """Render the 3-D PaCMAP embedding as a class-colored scatter plot and save it.

    Args:
        info (str): Run description shown on the figure.
        reduced_activations (np.ndarray): (n_samples, 3) embedded points.
        class_indices (Union[np.ndarray, List[int]]): Integer class index per sample.
        output_file_path (str): Path of the image file to write.
    """
    def load_labels_from_yaml(yaml_file_path: str):
        # Map encoded class index -> original label name, as written by load_data
        with open(yaml_file_path, 'r') as file:
            config = yaml.safe_load(file)
        return {int(key): value for key, value in config['MODEL']['labels'].items()}

    fig = plt.figure(figsize=(10, 8))
    plt.style.use('dark_background')
    ax = fig.add_subplot(111, projection='3d')
    # Hide the grey axis panes for a cleaner dark-background plot
    ax.xaxis.pane.fill = False
    ax.yaxis.pane.fill = False
    ax.zaxis.pane.fill = False
    # NOTE(review): the original use of `info` was truncated in this chunk;
    # used here as the plot title — confirm against the original file.
    ax.set_title(info)
    # Extracting the reduced dimensions
    x = reduced_activations[:, 0]
    y = reduced_activations[:, 1]
    z = reduced_activations[:, 2]
    scatter = ax.scatter(x, y, z, c=class_indices, cmap='RdYlBu', alpha=0.2, s=2)
    # Create legend: Map class indices to colors and words
    unique_classes = sorted(set(class_indices))
    colors = scatter.cmap(scatter.norm(unique_classes))
    class_to_word = load_labels_from_yaml("model_labels.yaml")
    # Create custom patches for the legend
    legend_patches = [mpatches.Patch(color=colors[i], label=class_to_word[unique_classes[i]])
                      for i in range(len(unique_classes))]
    # Add the legend to the plot
    plt.legend(handles=legend_patches, loc='upper right', title='Classes')
    plt.savefig(output_file_path, dpi=150)
    # Release the figure so repeated runs do not leak memory
    plt.close(fig)
def export_pcache(reduced_activations: np.ndarray, class_indices: Union[np.ndarray, List[int]],
                  output_file_path: str) -> None:
    """
    Generates and saves a .pcache file from the reduced activation data and class indices.

    Args:
        reduced_activations (np.ndarray): A 2D NumPy array of shape (n_samples, 3) representing the reduced
            dimensionality data (e.g., PacMAP or other embeddings).
        class_indices (Union[np.ndarray, List[int]]): A 1D NumPy array or list of class indices for each sample.
            These will be used to set the color values in the .pcache file.
        output_file_path (str): The path where the .pcache file will be saved.
            The file extension should be '.pcache'.

    Returns:
        None. Writes the formatted data to a .pcache file.

    Example:
        reduced_activations = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])
        class_indices = np.array([0, 1, 2])
        LSTMNeuralNet.export_pcache(reduced_activations, class_indices, 'output.pcache')
    """
    # Normalization divisor for mapping class index -> [0, 1] grey value;
    # clamp to 1.0 so a single-sample input cannot divide by zero.
    id_count = max(float(len(class_indices) - 1), 1.0)
    # Processing each row in the reduced_activations and class_indices
    pcache_data = [
        [float(reduction[0]),           # Pos X
         float(reduction[1]),           # Pos Y
         float(reduction[2]),           # Pos Z
         float(class_idx) / id_count,   # R channel
         float(class_idx) / id_count,   # G channel
         float(class_idx) / id_count,   # B channel
         1.0]                           # A channel (fully opaque)
        for reduction, class_idx in zip(reduced_activations, class_indices)
    ]
    # Header for the pcache file (Unity VFX Graph point-cache format).
    # NOTE(review): opener lines were truncated in this chunk — the magic/format
    # lines are reconstructed per the pcache spec; confirm against the original.
    pcache_header = [
        "pcache",
        "comment [Uncharted Limbo Collective] pcache file generated from PacMAP Dimensionality Reduction data",
        "format ascii 1.0",
        f"elements {len(pcache_data)}",
        "property float position.x",
        "property float position.y",
        "property float position.z",
        "property float color.r",
        "property float color.g",
        "property float color.b",
        "property float color.a",
        "end_header",
    ]
    # Combining header and data into the final pcache content
    final_content = "\n".join(pcache_header) + "\n"
    final_content += "\n".join([" ".join(map(str, row)) for row in pcache_data])
    # Writing the output to a .pcache file
    with open(output_file_path, 'w') as f:
        f.write(final_content)
    print(f"File saved to {output_file_path}")
def run(self, features_csv: str, labels_csv: str, key_column: str, win_size: int):
    """End-to-end pipeline: load data, build and train the LSTM, then export
    PaCMAP artifacts (embedding, reference image, Unity .pcache).

    Args:
        features_csv (str): Path to the feature CSV file.
        labels_csv (str): Path to the label CSV file.
        key_column (str): Column name used to merge features and labels.
        win_size (int): Window size for reshaping rows into sequences.
    """
    print("Loading Dataset...")
    X, y = LSTMNeuralNet.load_data(features_csv, labels_csv, key_column, win_size)
    input_shape = (X.shape[1], X.shape[2])  # LSTM expects input as [samples, time steps, features]
    num_classes = y.shape[1]  # one column per class in the one-hot labels
    class_indices = np.argmax(y, axis=1)  # Convert one_hot class encoding to integer
    print("Building LSTM model...")
    self.create_model(input_shape, num_classes)
    print("Training LSTM model...")
    self.train_model(X, y)
    # train_model persists the .h5 itself; this message mirrors that step
    print("Saving LSTM model to disk...")
    print("Running Dimensionality Reduction using PacMAP")
    embedding, reduced_activations = LSTMNeuralNet.run_pacmap(self.model, X)
    print("Saving PacMAP embedding")
    pacmap.save(embedding, self.pacmap_file_loc)
    print("Saving png for reference")
    LSTMNeuralNet.visualize_pacmap(self.info, reduced_activations, class_indices, self.image_file_loc)
    print("Saving .pcache for Unity")
    LSTMNeuralNet.export_pcache(reduced_activations, class_indices, self.pcache_file_loc)
    # Drop references to the large intermediate arrays to reduce peak memory
    del embedding, reduced_activations