Validation loop starts before training epoch is completed #1020
I cannot figure out why my validation loop starts before the training loop has even finished!

My `training_step`:

```python
# Imports used by these LightningModule methods (shown without the class for brevity).
from typing import Optional, Tuple

import torch
from pytorch_lightning.utilities.types import STEP_OUTPUT


def training_step(
    self,
    batch: Tuple[torch.Tensor, torch.Tensor],
    batch_idx: int,
    dataloader_idx: int = 0,
    optimizer_idx: int = 0
) -> STEP_OUTPUT:
    image, label = batch
    # noinspection PyArgumentList
    current_batch_size = image.size(0)
    prediction = self.forward(image)
    current_loss = self.loss_function(prediction, label)
    self.training_metrics.update(preds=prediction, target=label)
    self.training_loss.update(
        value=current_loss.clone().detach().squeeze(),
        weight=(1.0 / current_batch_size)
    )
    self.log(
        name="Training-Mean_Loss",
        value=self.training_loss,
        on_step=False,
        on_epoch=True,
        logger=True,
        sync_dist=False
    )
    self.log_dict(
        dictionary=self.training_metrics.loggable_dict(),
        on_step=False,
        on_epoch=True,
        logger=True,
        sync_dist=False
    )
    return {
        'loss': current_loss
    }

def validation_step(
    self,
    batch: Tuple[torch.Tensor, torch.Tensor],
    batch_idx: int,
    dataloader_idx: int = 0
) -> Optional[STEP_OUTPUT]:
    image, label = batch
    # noinspection PyArgumentList
    current_batch_size = image.size(0)
    prediction = self.forward(image)
    current_loss = self.loss_function(prediction, label)
    self.validation_metrics.update(preds=prediction, target=label)
    self.validation_loss.update(
        value=current_loss.clone().detach().squeeze(),
        weight=(1.0 / current_batch_size)
    )
    self.log(
        name="Validation-Mean_Loss",
        value=self.validation_loss,
        on_step=False,
        on_epoch=True,
        logger=True,
        sync_dist=False
    )
    self.log_dict(
        dictionary=self.validation_metrics.loggable_dict(),
        on_step=False,
        on_epoch=True,
        logger=True,
        sync_dist=False
    )
    return {
        'Validation-Loss': current_loss
    }
```

My `Trainer`:

```python
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, RichProgressBar
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.strategies import DDPStrategy

trainer = Trainer(
    logger=TensorBoardLogger(save_dir="logs", name='Run-1'),
    callbacks=[
        RichProgressBar(),
        ModelCheckpoint(
            dirpath="checkpoints",
            filename='FloodNet-{epoch}-{Validation-Mean_Loss:.3f}',
            monitor='Validation-Mean_Loss',
            save_top_k=2,
            save_last=True,
            save_on_train_epoch_end=False
        ),
        EarlyStopping(
            monitor="Validation-Mean_Loss",
            mode="min",
            patience=5,
            strict=True,
            check_finite=True,
            min_delta=1e-3,
            check_on_train_epoch_end=False,
        )
    ],
    check_val_every_n_epoch=1,
    num_sanity_val_steps=0,
    detect_anomaly=False,
    log_every_n_steps=10,
    enable_progress_bar=True,
    precision=16,
    strategy=DDPStrategy(find_unused_parameters=False),
    sync_batchnorm=False,
    enable_model_summary=False,
    max_epochs=100,
    accelerator="gpu",
    devices=-1
)
```
Replies: 1 comment · 2 replies
This question is more related to the Lightning repository than to torchmetrics. That said, the reason is that an *epoch* in Lightning comprises BOTH training and validation: by the time the validation progress bar appears, you are already done passing over your training data. To confirm this, check the length of your training dataloader and verify that it matches the step count at which the validation progress bar starts.
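
To make the suggested check concrete, here is a minimal sketch. The dataset, tensor shapes, and loader name are hypothetical placeholders, not taken from the original post; substitute your own training data:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Hypothetical stand-in for the actual training set.
train_dataset = TensorDataset(
    torch.randn(1000, 3, 256, 256),    # images
    torch.randint(0, 10, (1000,)),     # labels
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# One Lightning epoch = every training batch, then the full validation pass.
# The validation progress bar should therefore appear after exactly this many
# training steps (per process, when running on a single device).
print(f"Training batches per epoch: {len(train_loader)}")  # ceil(1000 / 32) = 32
```

Note that with `strategy=DDPStrategy(...)` and several devices, Lightning shards the dataset across processes, so each process iterates over roughly `len(train_loader) / world_size` batches per epoch, and the progress bar counts that per-process number.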