diff options
Diffstat (limited to 'languageupdater.py')
-rw-r--r-- | languageupdater.py | 83 |
1 files changed, 83 insertions, 0 deletions
diff --git a/languageupdater.py b/languageupdater.py new file mode 100644 index 0000000..a82e023 --- /dev/null +++ b/languageupdater.py @@ -0,0 +1,83 @@ +# This is part of Kaylee +# -- this code is licensed GPLv3 +# Copyright 2013 Jezra +# Copyright 2015 Clayton G. Hobbs + +import hashlib +import json +import re + +import requests + +class LanguageUpdater: + + def __init__(self, config): + self.config = config + + def update_language_if_changed(self): + """Test if the language has changed, and if it has, update it""" + if self.language_has_changed(): + self.update_language() + self.save_language_hash() + + def language_has_changed(self): + """Use SHA256 hashes to test if the language has changed""" + # Load the stored hash from the hash file + try: + with open(self.config.hash_file, 'r') as f: + hashes = json.load(f) + self.stored_hash = hashes['language'] + except (IOError, KeyError, TypeError): + # No stored hash + self.stored_hash = '' + + # Calculate the hash the language file has right now + hasher = hashlib.sha256() + with open(self.config.strings_file, 'rb') as sfile: + buf = sfile.read() + hasher.update(buf) + self.new_hash = hasher.hexdigest() + + return self.new_hash != self.stored_hash + + def update_language(self): + """Update the language using the online lmtool""" + print('Updating language using online lmtool') + + host = 'http://www.speech.cs.cmu.edu' + url = host + '/cgi-bin/tools/lmtool/run' + + # Prepare request + files = {'corpus': open(self.config.strings_file, 'rb')} + values = {'formtype': 'simple'} + + # Send corpus to the server + r = requests.post(url, files=files, data=values) + + # Parse response to get URLs of the files we need + for line in r.text.split('\n'): + # If we found the directory, keep it and don't break + if re.search(r'.*<title>Index of (.*?)</title>.*', line): + path = host + re.sub(r'.*<title>Index of (.*?)</title>.*', r'\1', line) + # If we found the number, keep it and break + elif re.search(r'.*TAR[0-9]*?\.tgz.*', line): + number = re.sub(r'.*TAR([0-9]*?)\.tgz.*', r'\1', line) + break + + lm_url = path + '/' + number + '.lm' + dic_url = path + '/' + number + '.dic' + + self._download_file(lm_url, self.config.lang_file) + self._download_file(dic_url, self.config.dic_file) + + def save_language_hash(self): + new_hashes = {'language': self.new_hash} + with open(self.config.hash_file, 'w') as f: + json.dump(new_hashes, f) + + def _download_file(self, url, path): + r = requests.get(url, stream=True) + if r.status_code == 200: + with open(path, 'wb') as f: + for chunk in r: + f.write(chunk) |