feat: Context CLI arg

This commit is contained in:
Tibo De Peuter 2025-12-11 13:58:38 +01:00
parent cd74949b74
commit a4583d402b
Signed by: tdpeuter
GPG key ID: 38297DE43F75FFE2
8 changed files with 38 additions and 31 deletions

View file

@ -18,18 +18,15 @@ class HumanReferenceGenomeDataset(Dataset):
split: str = "train",
transform: Callable = None,
size: int = -1,
context_length: int = 1024,
config: str = "6kbp",
):
super().__init__("human_reference_genome", root, split, transform, size)
super().__init__("human_reference_genome", root, split, transform, size, context_length)
print(f"Loading from HuggingFace (config: {config}, split: {split})")
ds = load_dataset("InstaDeepAI/human_reference_genome", config, split=split,
data = load_dataset("InstaDeepAI/human_reference_genome", config, split=split,
cache_dir=self.root, trust_remote_code=True)
# Dataset.process_data() expects a list[str]; use the 'sequence' field
self.data = ds["sequence"]
self.context_length = 2048
self.data = data["sequence"]
self.process_data()