# This is part of Kaylee
# -- this code is licensed GPLv3
# Copyright 2013 Jezra
# Copyright 2015 Clayton G. Hobbs
#
# Provenance: languageupdater.py as added by commit
# e4b693b2061a0e3d93feba4fa570df7424bbe0d4 (Clayton G. Hobbs, 2015-12-27),
# "Rewrote language_updater.sh in Python".  The same commit moved the logic
# that checks whether the language should be updated into the
# LanguageUpdater class, so none of this has to be done manually any more.

import hashlib
import json
import re


class LanguageUpdater:
    """Regenerate the language model files when the corpus file changes.

    The ``config`` object passed to the constructor must provide these
    attributes, all of which are file paths:

    * ``strings_file`` -- the corpus that is hashed and uploaded
    * ``hash_file``    -- JSON file in which the last corpus hash is stored
    * ``lang_file``    -- destination of the downloaded ``.lm`` file
    * ``dic_file``     -- destination of the downloaded ``.dic`` file
    """

    # Location of the online lmtool
    LMTOOL_HOST = 'http://www.speech.cs.cmu.edu'
    LMTOOL_PATH = '/cgi-bin/tools/lmtool/run'

    # Number of bytes read/written at a time when hashing and downloading
    CHUNK_SIZE = 65536

    # The directory is captured up to the next whitespace or '<' so that it
    # is found whether or not the text is wrapped in an HTML tag.  (A lazy
    # group followed by a greedy '.*' would always capture the empty string.)
    _DIRECTORY_RE = re.compile(r'Index of ([^\s<]+)')
    _NUMBER_RE = re.compile(r'TAR([0-9]+)\.tgz')

    def __init__(self, config):
        """Remember the configuration object holding the file paths."""
        self.config = config

    def update_language_if_changed(self):
        """Test if the language has changed, and if it has, update it

        The new hash is only saved after ``update_language`` returned
        without raising, so a failed update is retried on the next call.
        """
        if self.language_has_changed():
            self.update_language()
            self.save_language_hash()

    def language_has_changed(self):
        """Use SHA256 hashes to test if the language has changed

        Sets ``self.stored_hash`` (empty string when no usable hash is
        stored) and ``self.new_hash`` as a side effect.

        Returns:
            True if the hash of the corpus file differs from the stored one.

        Raises:
            IOError: if the corpus file itself cannot be read.
        """
        # Load the stored hash from the hash file
        try:
            with open(self.config.hash_file, 'r') as f:
                hashes = json.load(f)
            self.stored_hash = hashes['language']
        except (IOError, KeyError, TypeError, ValueError):
            # No stored hash: the file is missing, is not valid JSON
            # (ValueError), or does not have the expected structure
            self.stored_hash = ''

        # Calculate the hash the language file has right now, reading it in
        # chunks so the whole corpus never has to sit in memory at once
        hasher = hashlib.sha256()
        with open(self.config.strings_file, 'rb') as sfile:
            for chunk in iter(lambda: sfile.read(self.CHUNK_SIZE), b''):
                hasher.update(chunk)
        self.new_hash = hasher.hexdigest()

        return self.new_hash != self.stored_hash

    def update_language(self):
        """Update the language using the online lmtool

        Uploads the corpus, parses the response for the location of the
        generated files and downloads the ``.lm`` and ``.dic`` files.

        Raises:
            requests.RequestException: on network or HTTP errors.
            RuntimeError: if the response cannot be parsed or a download
                returns an unexpected status.
        """
        # Imported here so that merely checking the hash does not need the
        # network library to be loaded
        import requests

        print('Updating language using online lmtool')

        url = self.LMTOOL_HOST + self.LMTOOL_PATH
        values = {'formtype': 'simple'}

        # Send corpus to the server; the context manager makes sure the
        # corpus file is closed again once the upload is done
        with open(self.config.strings_file, 'rb') as corpus:
            r = requests.post(url, files={'corpus': corpus}, data=values)
        r.raise_for_status()

        # Parse response to get URLs of the files we need
        directory, number = self._parse_lmtool_response(r.text)
        base_url = self.LMTOOL_HOST + directory + '/' + number

        self._download_file(base_url + '.lm', self.config.lang_file)
        self._download_file(base_url + '.dic', self.config.dic_file)

    @classmethod
    def _parse_lmtool_response(cls, text):
        """Return ``(directory, number)`` found in an lmtool response

        Lines are scanned in order.  The directory of the last
        "Index of ..." line seen is kept; scanning stops at the first other
        line that names a ``TAR<number>.tgz`` archive.

        Raises:
            RuntimeError: if either value is missing from the response.
        """
        directory = None
        number = None
        for line in text.splitlines():
            # If we found the directory, keep it and don't break
            directory_match = cls._DIRECTORY_RE.search(line)
            if directory_match:
                directory = directory_match.group(1)
                continue
            # If we found the number, keep it and break
            number_match = cls._NUMBER_RE.search(line)
            if number_match:
                number = number_match.group(1)
                break

        if directory is None or number is None:
            raise RuntimeError(
                'Could not find the generated language files in the '
                'lmtool response')
        return directory, number

    def save_language_hash(self):
        """Store the hash computed by ``language_has_changed`` on disk

        Overwrites the hash file with a JSON object whose ``language`` key
        holds ``self.new_hash``.
        """
        new_hashes = {'language': self.new_hash}
        with open(self.config.hash_file, 'w') as f:
            json.dump(new_hashes, f)

    def _download_file(self, url, path):
        """Download ``url`` and write the body to the file at ``path``

        Raises:
            requests.RequestException: on network or HTTP errors.
            RuntimeError: if the server answers with a status other than 200.
        """
        import requests

        r = requests.get(url, stream=True)
        try:
            # Fail loudly instead of silently skipping the download;
            # otherwise the caller would record the new hash although the
            # language files on disk are still the old ones
            r.raise_for_status()
            if r.status_code != 200:
                raise RuntimeError(
                    'Unexpected HTTP status %d for %s' % (r.status_code, url))
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=self.CHUNK_SIZE):
                    f.write(chunk)
        finally:
            # Release the connection of the streamed response
            r.close()