1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
# This is part of Kaylee
# -- this code is licensed GPLv3
# Copyright 2015 Clayton G. Hobbs
# Portions Copyright 2013 Jezra
import hashlib
import json
import re
import requests
class LanguageUpdater:
    """Regenerate the language files when the strings file changes.

    The strings file (``config.strings_file``) is hashed with SHA-256 and the
    digest is compared with the one recorded in ``config.hash_file``.  When
    they differ, the strings file is uploaded to the CMU online lmtool and
    the generated ``.lm`` and ``.dic`` files are downloaded to
    ``config.lang_file`` and ``config.dic_file``.
    """

    def __init__(self, config):
        """Store the configuration object.

        Args:
            config: object exposing the ``hash_file``, ``strings_file``,
                ``lang_file`` and ``dic_file`` path attributes.
        """
        self.config = config

    def update_language_if_changed(self):
        """Test if the language has changed, and if it has, update it.

        The new hash is only recorded when the update succeeded, so a failed
        download is retried on the next call instead of being remembered as
        already up to date.
        """
        if not self.language_has_changed():
            return
        if self.update_language():
            self.save_language_hash()

    def language_has_changed(self):
        """Use SHA256 hashes to test if the language has changed.

        Sets ``self.stored_hash`` (the digest read from the hash file, or
        ``''`` when none is available) and ``self.new_hash`` (the digest of
        the current strings file).

        Returns:
            bool: True when the two digests differ.

        Raises:
            IOError: if the strings file cannot be read.
        """
        # Load the stored hash from the hash file
        try:
            with open(self.config.hash_file, 'r') as f:
                hashes = json.load(f)
            self.stored_hash = hashes['language']
        except (IOError, ValueError, KeyError, TypeError):
            # No usable stored hash: the file is missing, is not valid JSON
            # (json raises a ValueError subclass), or lacks the expected key.
            self.stored_hash = ''

        # Calculate the hash the language file has right now
        with open(self.config.strings_file, 'rb') as sfile:
            self.new_hash = hashlib.sha256(sfile.read()).hexdigest()

        return self.new_hash != self.stored_hash

    def update_language(self):
        """Update the language using the online lmtool.

        Uploads the strings file, parses the response for the result
        directory and the file number, then downloads ``<number>.lm`` and
        ``<number>.dic`` from that directory.

        Returns:
            bool: True when both files were downloaded, False otherwise.

        Raises:
            RuntimeError: if the lmtool response does not contain the
                expected directory title and ``TAR<number>.tgz`` entry.
        """
        print('Updating language using online lmtool')

        host = 'http://www.speech.cs.cmu.edu'
        url = host + '/cgi-bin/tools/lmtool/run'

        # Send corpus to the server; the with-block closes the file handle
        # once the upload is done.
        values = {'formtype': 'simple'}
        with open(self.config.strings_file, 'rb') as corpus:
            r = requests.post(url, files={'corpus': corpus}, data=values)

        # Parse response to get URLs of the files we need
        path_re = re.compile(r'.*<title>Index of (.*?)</title>.*')
        number_re = re.compile(r'.*TAR([0-9]*?)\.tgz.*')
        path = None
        number = None
        for line in r.text.split('\n'):
            # If we found the directory, keep it and don't break
            path_match = path_re.search(line)
            if path_match:
                path = host + path_match.group(1)
                continue
            # If we found the number, keep it and break
            number_match = number_re.search(line)
            if number_match:
                number = number_match.group(1)
                break

        if path is None or number is None:
            raise RuntimeError(
                'Could not find the language file location in the lmtool '
                'response (HTTP status {})'.format(r.status_code))

        lm_url = path + '/' + number + '.lm'
        dic_url = path + '/' + number + '.dic'

        # Attempt both downloads even if the first one fails.
        lm_ok = self._download_file(lm_url, self.config.lang_file)
        dic_ok = self._download_file(dic_url, self.config.dic_file)
        return lm_ok and dic_ok

    def save_language_hash(self):
        """Write ``self.new_hash`` to the hash file as JSON.

        The hash file is overwritten with a single ``'language'`` entry.
        ``self.new_hash`` is set by ``language_has_changed``.
        """
        new_hashes = {'language': self.new_hash}
        with open(self.config.hash_file, 'w') as f:
            json.dump(new_hashes, f)

    def _download_file(self, url, path):
        """Download ``url`` to ``path``.

        The destination is only written when the server answers with HTTP
        status 200.

        Returns:
            bool: True when the file was written, False otherwise.
        """
        r = requests.get(url, stream=True)
        try:
            if r.status_code != 200:
                print('Failed to download {} (HTTP status {})'.format(
                    url, r.status_code))
                return False
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
            return True
        finally:
            # A streamed response holds its connection until closed.
            r.close()
|