chore(transformer-xl): Initial commit

This commit is contained in:
Tibo De Peuter 2025-11-07 12:58:13 +01:00
parent ef4684ef39
commit 10512876f2
Signed by: tdpeuter
GPG key ID: 38297DE43F75FFE2
46 changed files with 10547 additions and 0 deletions

View file

@@ -0,0 +1,32 @@
#!/usr/bin/env python
# coding=utf-8
import os
import sys
import zipfile
from io import open
# Number of trailing characters reserved for each of the valid and test splits.
NUM_TEST_CHARS = 5000000


def tokenize_chars(text):
    """Return *text* as space-separated single-character tokens.

    Leading and trailing whitespace is stripped, every remaining space is
    replaced by the token ``_``, and the resulting characters are joined with
    single spaces.
    """
    # str.join iterates a string one character at a time, so there is no need
    # to build an intermediate per-character list in Python.
    return ' '.join(text.strip().replace(' ', '_'))


def split_corpus(data, num_test_chars=NUM_TEST_CHARS):
    """Split *data* into a ``(train, valid, test)`` tuple of strings.

    The last ``num_test_chars`` characters form the test part, the
    ``num_test_chars`` characters before those form the validation part, and
    everything earlier is the training part.

    Raises:
        ValueError: if ``num_test_chars`` is not positive.
    """
    if num_test_chars <= 0:
        # With 0, ``data[:-0]`` would silently yield an empty training part.
        raise ValueError('num_test_chars must be positive')
    train_data = data[: -2 * num_test_chars]
    valid_data = data[-2 * num_test_chars: -num_test_chars]
    test_data = data[-num_test_chars:]
    return train_data, valid_data, test_data


def main():
    """Extract ``text8.zip`` and write the train/valid/test files.

    For each split two files are written in the current directory: the
    character-tokenized text (``<split>.txt``) and the untouched slice
    (``<split>.txt.raw``). Exits immediately if ``train.txt`` already exists.
    """
    if os.path.exists('train.txt'):
        print('Tokenized text8 already exists - skipping processing')
        sys.exit()

    # Context managers make sure the archive and every file handle are closed.
    with zipfile.ZipFile('text8.zip') as archive:
        archive.extractall()
    with open('text8', 'r', encoding='utf-8') as corpus:
        data = corpus.read()

    print('Length of text8: {}'.format(len(data)))

    train_data, valid_data, test_data = split_corpus(data)

    for fn, part in [('train.txt', train_data), ('valid.txt', valid_data), ('test.txt', test_data)]:
        # NOTE: len(part) counts characters, not encoded bytes.
        print('{} will have {} bytes'.format(fn, len(part)))
        print('- Tokenizing...')
        # Change space ' ' to underscore '_'
        part_str = tokenize_chars(part)
        print('- Writing...')
        # Write both outputs as UTF-8, matching the encoding used for reading.
        with open(fn, 'w', encoding='utf-8') as tokenized_file:
            tokenized_file.write(part_str)
        with open(fn + '.raw', 'w', encoding='utf-8') as raw_file:
            raw_file.write(part)


if __name__ == '__main__':
    main()