feat: Smaller genome dataset

Tibo De Peuter 2025-12-11 13:38:52 +01:00
parent d0457b6571
commit cd74949b74
Signed by: tdpeuter
GPG key ID: 38297DE43F75FFE2
5 changed files with 337 additions and 77 deletions

View file

@@ -0,0 +1,48 @@
from typing import Callable

from datasets import load_dataset

from .Dataset import Dataset


class HumanReferenceGenomeDataset(Dataset):
    """
    Hugging Face: https://huggingface.co/datasets/InstaDeepAI/human_reference_genome

    :param split: 'train' | 'validation' | 'test'
    :param config: '6kbp' | '12kbp' (chunk length in the HF builder config)
    """

    def __init__(self,
                 root: str | None = None,
                 split: str = "train",
                 transform: Callable | None = None,
                 size: int = -1,
                 config: str = "6kbp",
                 ):
        super().__init__("human_reference_genome", root, split, transform, size)

        print(f"Loading from HuggingFace (config: {config}, split: {split})")
        ds = load_dataset("InstaDeepAI/human_reference_genome", config, split=split,
                          cache_dir=self.root, trust_remote_code=True)

        # Dataset.process_data() expects a list[str]; use the 'sequence' field
        self.data = ds["sequence"]
        self.context_length = 2048
        self.process_data()

        print("Done initializing dataset")

    def __len__(self):
        # Number of valid window starts: one target token must follow each window
        return self.chunk_offsets[-1] - self.context_length

    def __getitem__(self, idx):
        # Sliding window: context_length tokens as input, the next token as target
        x = self.tensor[idx: idx + self.context_length]
        y = self.tensor[idx + self.context_length]

        if self.transform:
            x = self.transform(x)

        return x, y
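
For context, a minimal usage sketch (not part of this commit): it assumes the parent Dataset class builds self.tensor and self.chunk_offsets inside process_data(), that items come back as PyTorch tensors, and that the import path below is a placeholder for wherever this package lives.

# Usage sketch, not part of this commit; import path is a placeholder.
from torch.utils.data import DataLoader

from my_project.datasets import HumanReferenceGenomeDataset  # placeholder path

ds = HumanReferenceGenomeDataset(root="./data", split="validation", config="6kbp")
loader = DataLoader(ds, batch_size=8, shuffle=True)  # batch size is illustrative

x, y = next(iter(loader))
print(x.shape)  # expected: (8, 2048), windows of 2048 encoded bases
print(y.shape)  # expected: (8,), the base immediately following each window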

View file

@@ -1,10 +1,12 @@
from .Dataset import Dataset
from .EnWik9 import EnWik9DataSet
from .HumanReferenceGenomeDataset import HumanReferenceGenomeDataset
from .LoremIpsumDataset import LoremIpsumDataset
from .OpenGenomeDataset import OpenGenomeDataset

dataset_called: dict[str, type[Dataset]] = {
    'enwik9': EnWik9DataSet,
    'lorem_ipsum': LoremIpsumDataset,
    'opengenome': OpenGenomeDataset,
    'humanreference': HumanReferenceGenomeDataset
}
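
The registry maps a configuration name to its dataset class. A hedged sketch of how it might be consumed; the get_dataset helper below is hypothetical and not part of this commit.

# Hypothetical helper, not part of this commit: resolve a name through the
# registry and construct the dataset with whatever keyword arguments it accepts.
def get_dataset(name: str, **kwargs) -> Dataset:
    try:
        cls = dataset_called[name]
    except KeyError:
        raise ValueError(f"Unknown dataset '{name}'; choose one of {sorted(dataset_called)}")
    return cls(**kwargs)

# e.g. the entry added in this commit:
genome = get_dataset('humanreference', split='train', config='6kbp')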