diff --git a/src/dataset_loaders/Dataset.py b/src/dataset_loaders/Dataset.py index 63763af..bc643dd 100644 --- a/src/dataset_loaders/Dataset.py +++ b/src/dataset_loaders/Dataset.py @@ -49,12 +49,12 @@ class Dataset(TorchDataset, ABC): return len(self.dataset) def process_data(self): + self.chunk_offsets = self.get_offsets() if self.size == -1: # Just use the whole dataset self.bytes = ''.join(tqdm(self.data, desc="Encoding data")).encode('utf-8', errors='replace') else: # Use only partition, calculate offsets - self.chunk_offsets = self.get_offsets() self.bytes = ''.join(tqdm(self.data[:len(self.chunk_offsets)], desc="Encoding data")).encode('utf-8', errors='replace') self.tensor = torch.tensor(list(self.bytes), dtype=torch.long)