feat: Smaller genome dataset
This commit is contained in:
parent
d0457b6571
commit
cd74949b74
5 changed files with 337 additions and 77 deletions
48
src/dataset_loaders/HumanReferenceGenomeDataset.py
Normal file
48
src/dataset_loaders/HumanReferenceGenomeDataset.py
Normal file
|
|
@@ -0,0 +1,48 @@
|
|||
from typing import Callable
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
from .Dataset import Dataset
|
||||
|
||||
|
||||
class HumanReferenceGenomeDataset(Dataset):
    """
    Next-element prediction dataset over the InstaDeepAI human reference genome.

    Hugging Face: https://huggingface.co/datasets/InstaDeepAI/human_reference_genome

    Each item is a window of ``context_length`` consecutive elements of
    ``self.tensor`` together with the single element that follows the window.

    :param root: forwarded to the base ``Dataset``; the resulting ``self.root``
        is used as the Hugging Face ``cache_dir``
    :param split: 'train' | 'validation' | 'test'
    :param transform: optional callable applied to the input window only
        (the target element is returned untouched)
    :param size: forwarded unchanged to the base ``Dataset``
    :param config: '6kbp' | '12kbp' (chunk length in the HF builder config)
    :param context_length: number of elements in each input window
    :raises ValueError: if ``context_length`` is not positive
    """

    def __init__(self,
                 root: str | None = None,
                 split: str = "train",
                 transform: Callable | None = None,
                 size: int = -1,
                 config: str = "6kbp",
                 context_length: int = 2048,
                 ):
        if context_length <= 0:
            raise ValueError(f"context_length must be positive, got {context_length}")

        super().__init__("human_reference_genome", root, split, transform, size)

        print(f"Loading from HuggingFace (config: {config}, split: {split})")
        # SECURITY: trust_remote_code=True executes the dataset's loading
        # script downloaded from the Hub. Only keep it while the upstream
        # repository is trusted (ideally pin a revision).
        ds = load_dataset("InstaDeepAI/human_reference_genome", config, split=split,
                          cache_dir=self.root, trust_remote_code=True)

        # Only the 'sequence' column is kept as the raw data.
        # NOTE(review): the base Dataset.process_data() presumably expects a
        # sequence of strings here -- confirm against the base class.
        self.data = ds["sequence"]

        # Must be set before process_data(), matching the original ordering.
        self.context_length = context_length

        # NOTE(review): process_data() is expected to populate self.tensor and
        # self.chunk_offsets, which __len__/__getitem__ rely on -- confirm.
        self.process_data()

        print("Done initializing dataset")

    def __len__(self):
        """Return the number of (window, next element) pairs available.

        Clamped at zero so that a corpus shorter than one context window
        yields an empty dataset instead of making ``len()`` raise on a
        negative value.
        """
        # NOTE(review): assumes chunk_offsets[-1] is the total number of
        # elements in self.tensor -- confirm against the base class.
        total = int(self.chunk_offsets[-1])
        return max(0, total - self.context_length)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for position ``idx``.

        ``x`` is ``self.tensor[idx: idx + context_length]`` (after the optional
        transform) and ``y`` is the element right after that window.

        :raises IndexError: if ``idx`` is outside ``[-len(self), len(self))``
        """
        length = len(self)
        if idx < 0:
            # Support Python-style negative indexing instead of silently
            # slicing a wrong (possibly empty) window.
            idx += length
        if not 0 <= idx < length:
            raise IndexError(f"index {idx} out of range for dataset of length {length}")

        x = self.tensor[idx: idx + self.context_length]
        y = self.tensor[idx + self.context_length]

        if self.transform:
            x = self.transform(x)

        return x, y
|
||||
|
|
@@ -1,10 +1,12 @@
|
|||
from .Dataset import Dataset
|
||||
from .EnWik9 import EnWik9DataSet
|
||||
from .HumanReferenceGenomeDataset import HumanReferenceGenomeDataset
|
||||
from .LoremIpsumDataset import LoremIpsumDataset
|
||||
from .OpenGenomeDataset import OpenGenomeDataset
|
||||
|
||||
# Registry mapping the dataset name used for selection to its loader class.
# Post-commit state of the hunk: the 'opengenome' entry gained its separating
# comma and 'humanreference' was added.
dataset_called: dict[str, type[Dataset]] = {
    'enwik9': EnWik9DataSet,
    'lorem_ipsum': LoremIpsumDataset,
    'opengenome': OpenGenomeDataset,
    'humanreference': HumanReferenceGenomeDataset,
}
|
||||
|
|
|
|||
Reference in a new issue