Quick Start Guide

This guide will get you up and running with EvoAug2 in just a few minutes. You’ll learn how to apply evolution-inspired augmentations to genomic sequences and integrate them into your PyTorch training pipeline.

Basic Usage

Import the required modules:

import torch
from evoaug.augment import RandomMutation, RandomDeletion, RandomTranslocation
from evoaug.evoaug import RobustLoader

Create augmentation objects:

# Single nucleotide mutations (5% mutation rate)
mutation = RandomMutation(mut_frac=0.05)

# Random deletions (0-30 nucleotides)
deletion = RandomDeletion(delete_min=0, delete_max=30)

# Random translocations (0-20 nucleotide shifts)
translocation = RandomTranslocation(shift_min=0, shift_max=20)

Apply augmentations to a sequence:

# Example DNA sequence (one-hot encoded): draw a random base index for every
# position, then expand it into 4 channels so each position holds exactly one 1.
base_indices = torch.randint(0, 4, (1, 200))
# Resulting layout: [batch, length, channels]
sequence = torch.nn.functional.one_hot(base_indices, num_classes=4).float()
# NOTE(review): verify that [batch, length, channels] is the axis order the
# augmentations expect before reusing this layout — TODO confirm.

# Apply single augmentation
augmented_seq = mutation(sequence)

# Apply multiple augmentations (each one receives the output of the previous)
augmentations = [mutation, deletion, translocation]
for aug in augmentations:
    sequence = aug(sequence)

Integration with PyTorch DataLoader

Create a custom dataset with augmentations:

from torch.utils.data import Dataset, DataLoader

class GenomicDataset(Dataset):
    """Dataset of (sequence, label) pairs with optional per-item augmentation.

    Args:
        sequences: Indexable collection of sequences.
        labels: Indexable collection of labels, aligned with ``sequences``.
        augmentations: Optional list of callables. Each one is called with
            the current sequence and must return the transformed sequence.
            ``None`` (or any empty value) means no augmentation.
    """

    def __init__(self, sequences, labels, augmentations=None):
        self.sequences = sequences
        self.labels = labels
        # Normalise "nothing given" to an empty list.
        self.augmentations = augmentations if augmentations else []

    def __len__(self):
        # One item per stored sequence.
        return len(self.sequences)

    def __getitem__(self, idx):
        item = self.sequences[idx]

        # Apply augmentations during training: chain them in the given order,
        # feeding each the result of the previous one. A falsy value is
        # treated as "no augmentations".
        for transform in (self.augmentations or ()):
            item = transform(item)

        return item, self.labels[idx]

# Create dataset with augmentations
# (your_sequences and your_labels are placeholders that this guide does not
# define; substitute your own data)
dataset = GenomicDataset(
    sequences=your_sequences,
    labels=your_labels,
    augmentations=[mutation, deletion, translocation]
)

# Create DataLoader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

Two-Stage Training Approach

Stage 1: Training with augmentations

# Train model with augmentations
model = YourModel()  # placeholder: substitute your own nn.Module
criterion = torch.nn.CrossEntropyLoss()  # example loss; choose one that fits your task
optimizer = torch.optim.Adam(model.parameters())
num_epochs = 10  # example value; tune for your dataset

model.train()  # make sure the model is in training mode
for epoch in range(num_epochs):
    for batch_seqs, batch_labels in dataloader:
        # Forward pass
        outputs = model(batch_seqs)
        loss = criterion(outputs, batch_labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Stage 2: Fine-tuning on original data

# Create a dataset and dataloader without augmentations for fine-tuning
# (same placeholder data as before, but no augmentations are passed)
base_dataset = GenomicDataset(sequences=your_sequences, labels=your_labels)
finetune_dataloader = DataLoader(base_dataset, batch_size=32, shuffle=True)

# Fine-tune with lower learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(5):  # Fewer epochs for fine-tuning
    for batch_seqs, batch_labels in finetune_dataloader:
        outputs = model(batch_seqs)
        loss = criterion(outputs, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Complete Example

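Here’s a complete working example that trains a small model on dummy data with augmentations applied through RobustLoader (Stage 1 of the two-stage approach):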

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from evoaug.augment import RandomMutation, RandomDeletion
from evoaug.evoaug import RobustLoader

# Simple model
class SimpleModel(nn.Module):
    """Minimal 1D-convolutional classifier used by this example.

    One convolution over the 4 input channels is followed by global average
    pooling and a linear layer that outputs ``num_classes`` scores.

    Args:
        input_size: Accepted for convenience; not used by the layers
            defined here.
        num_classes: Number of output scores produced per input.
    """

    def __init__(self, input_size=200, num_classes=2):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=4, out_channels=32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(output_size=1)
        self.fc = nn.Linear(in_features=32, out_features=num_classes)

    def forward(self, x):
        # Conv1d wants channels before length:
        # [batch, length, channels] -> [batch, channels, length]
        channels_first = x.transpose(1, 2)
        features = torch.relu(self.conv1(channels_first))
        # Pooling leaves a trailing axis of size 1; drop it before the linear layer.
        pooled = self.pool(features).squeeze(-1)
        return self.fc(pooled)

# Dataset
class GenomicDataset(Dataset):
    """Dataset that pairs each stored sequence with its label, unmodified."""

    def __init__(self, sequences, labels):
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        # One sample per stored sequence.
        return len(self.sequences)

    def __getitem__(self, idx):
        sample = self.sequences[idx]
        target = self.labels[idx]
        return sample, target

# Generate dummy data
# (random values stand in for real encoded sequences; labels are random
# integer class indices in [0, num_classes))
num_samples = 1000
seq_length = 200
num_classes = 2

# Tensor of shape [num_samples, seq_length, 4]
sequences = torch.randn(num_samples, seq_length, 4)
labels = torch.randint(0, num_classes, (num_samples,))

# Create dataset
dataset = GenomicDataset(sequences, labels)

# Create augmentations
augmentations = [
    RandomMutation(mut_frac=0.05),
    RandomDeletion(delete_min=0, delete_max=20)
]

# Create RobustLoader
# (it receives the plain dataset and the augmentation list; see the API
# reference for the exact meaning of max_augs_per_seq and hard_aug)
train_loader = RobustLoader(
    base_dataset=dataset,
    augment_list=augmentations,
    max_augs_per_seq=1,
    hard_aug=False,
    batch_size=32,
    shuffle=True
)

# Create model and training components
model = SimpleModel()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
model.train()
for epoch in range(10):
    # Running sum of per-batch losses for this epoch
    total_loss = 0
    for batch_seqs, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_seqs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean loss per batch for this epoch
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

print("Training completed!")

Next Steps

Now that you have the basics:

  1. Explore Augmentations: See the augmentations guide (user_guide/augmentations) for all available augmentation types

  2. Advanced Training: Learn more about the two-stage approach in the training guide (user_guide/training)

  3. Examples: Run the complete examples in the PyTorch Lightning Integration Example and the Vanilla PyTorch Integration Example

  4. API Reference: Consult the EvoAug2 Core API for detailed class and method documentation

Need Help?

  • See the troubleshooting guide (user_guide/troubleshooting) for common issues

  • Review the Examples section for more complex use cases

  • Visit the GitHub repository for issues and discussions