chore(transformer-xl): Initial commit
This commit is contained in:
parent
ef4684ef39
commit
10512876f2
46 changed files with 10547 additions and 0 deletions
32
transformer-xl/prep_text8.py
Normal file
32
transformer-xl/prep_text8.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
|
||||
import os
|
||||
import sys
|
||||
import zipfile
|
||||
|
||||
from io import open
|
||||
|
||||
# Number of characters held out for each of the validation and test parts.
num_test_chars = 5000000


def split_corpus(data, num_held_out):
    """Split *data* into ``(train, valid, test)`` parts.

    The last ``num_held_out`` characters form the test part, the
    ``num_held_out`` characters before them form the validation part, and
    everything earlier is the training part.
    """
    train_data = data[: -2 * num_held_out]
    valid_data = data[-2 * num_held_out: -num_held_out]
    test_data = data[-num_held_out:]
    return train_data, valid_data, test_data


def tokenize(part):
    """Return *part* as a string of space-separated characters.

    Surrounding whitespace is stripped first. Every remaining space is
    changed to an underscore '_' so that it is still visible as its own
    token once the characters are joined with spaces.
    """
    return ' '.join(part.strip().replace(' ', '_'))


def main():
    """Extract ``text8.zip`` and write the train/valid/test files.

    Works in the current directory. For each part, ``<name>.txt`` receives
    the tokenized text and ``<name>.txt.raw`` the untouched slice. Does
    nothing (beyond printing a notice) when ``train.txt`` already exists.
    """
    if os.path.exists('train.txt'):
        print('Tokenized text8 already exists - skipping processing')
        return

    # Context managers make sure the archive and every file handle are
    # closed (and output is flushed) before moving on.
    with zipfile.ZipFile('text8.zip') as archive:
        archive.extractall()
    with open('text8', 'r', encoding='utf-8') as f:
        data = f.read()

    print('Length of text8: {}'.format(len(data)))

    train_data, valid_data, test_data = split_corpus(data, num_test_chars)

    for fn, part in [('train.txt', train_data),
                     ('valid.txt', valid_data),
                     ('test.txt', test_data)]:
        print('{} will have {} bytes'.format(fn, len(part)))
        print('- Tokenizing...')
        part_str = tokenize(part)
        print('- Writing...')
        # Both outputs use an explicit utf-8 encoding so the result does not
        # depend on the locale's default encoding.
        with open(fn, 'w', encoding='utf-8') as f:
            f.write(part_str)
        with open(fn + '.raw', 'w', encoding='utf-8') as f:
            f.write(part)


if __name__ == '__main__':
    main()
|
||||
Reference in a new issue