32 lines
939 B
Python
32 lines
939 B
Python
#!/usr/bin/env python
# coding=utf-8
"""Split the text8 corpus into train/valid/test files.

When run as a script this extracts ``text8.zip`` from the current
directory, reads the extracted ``text8`` file, holds out the last
``2 * num_test_chars`` characters (half for validation, half for test) and
writes, for each split, a tokenized ``<split>.txt`` plus the untouched text
as ``<split>.txt.raw``. If ``train.txt`` already exists, nothing is done.
"""

import os
import sys
import zipfile

from io import open

# Number of characters held out for each of the validation and test splits.
num_test_chars = 5000000


def tokenize(part):
    """Return *part* as space-separated characters, with ' ' shown as '_'.

    Leading and trailing whitespace is stripped first, so the result never
    starts or ends with an underscore token produced by padding.
    """
    # Replacing before joining keeps the real spaces distinguishable from
    # the separator spaces inserted by the join.
    return ' '.join(part.strip().replace(' ', '_'))


def split_corpus(data, test_chars=num_test_chars):
    """Split *data* into ``(train, valid, test)`` strings.

    The last ``test_chars`` characters form the test split, the
    ``test_chars`` characters before them form the validation split, and
    everything earlier is the training split.
    """
    # Explicit, clamped indices instead of negative slicing: with
    # ``test_chars == 0`` the expression ``data[:-0]`` would be empty rather
    # than the whole text.
    total = len(data)
    test_start = max(total - test_chars, 0)
    valid_start = max(total - 2 * test_chars, 0)
    return data[:valid_start], data[valid_start:test_start], data[test_start:]


def main():
    """Extract text8.zip and write the tokenized and raw split files."""
    if os.path.exists('train.txt'):
        print('Tokenized text8 already exists - skipping processing')
        sys.exit()

    # Context managers make sure the archive and every file are closed.
    with zipfile.ZipFile('text8.zip') as archive:
        archive.extractall()
    with open('text8', 'r', encoding='utf-8') as source:
        data = source.read()

    print('Length of text8: {}'.format(len(data)))

    train_data, valid_data, test_data = split_corpus(data, num_test_chars)

    for fn, part in [('train.txt', train_data), ('valid.txt', valid_data), ('test.txt', test_data)]:
        print('{} will have {} bytes'.format(fn, len(part)))
        print('- Tokenizing...')
        # Change space ' ' to underscore '_'
        part_str = tokenize(part)
        print('- Writing...')
        # Same explicit encoding for both outputs, independent of the locale.
        with open(fn, 'w', encoding='utf-8') as tokenized_file:
            tokenized_file.write(part_str)
        with open(fn + '.raw', 'w', encoding='utf-8') as raw_file:
            raw_file.write(part)


if __name__ == '__main__':
    main()