1 files changed, 83 insertions, 0 deletions
diff --git a/languageupdater.py b/languageupdater.py
new file mode 100644
index 0000000..a82e023
--- /dev/null
+++ b/languageupdater.py
@@ -0,0 +1,83 @@
+# This is part of Kaylee
+# -- this code is licensed GPLv3
+# Copyright 2013 Jezra
+# Copyright 2015 Clayton G. Hobbs
+
+import hashlib
+import json
+import re
+
+import requests
+
+class LanguageUpdater:
+    """Update the language files via the online lmtool when strings change.
+
+    ``config`` supplies hash_file, strings_file, lang_file and dic_file paths.
+    """
+
+    def __init__(self, config):
+        self.config = config
+
+    def update_language_if_changed(self):
+        """Test if the language has changed, and if it has, update it"""
+        if self.language_has_changed():
+            self.update_language()
+            # Reached only if the update did not raise, so failures are retried
+            self.save_language_hash()
+
+    def language_has_changed(self):
+        """Return True if the strings file's SHA256 is not the stored hash"""
+        try:
+            with open(self.config.hash_file, 'r') as f:
+                self.stored_hash = json.load(f)['language']
+        except (IOError, KeyError, TypeError, ValueError):
+            # Missing, unreadable or invalid JSON hash file: no stored hash
+            self.stored_hash = ''
+
+        with open(self.config.strings_file, 'rb') as sfile:
+            self.new_hash = hashlib.sha256(sfile.read()).hexdigest()
+        return self.new_hash != self.stored_hash
+
+    def update_language(self):
+        """Send the strings file to the online lmtool and download the results.
+
+        Raises RuntimeError if the response cannot be parsed and
+        requests.HTTPError if the server answers with an error status.
+        """
+        print('Updating language using online lmtool')
+        host = 'http://www.speech.cs.cmu.edu'
+        url = host + '/cgi-bin/tools/lmtool/run'
+        # Send corpus to the server; the file is closed once it is uploaded
+        with open(self.config.strings_file, 'rb') as corpus:
+            r = requests.post(url, files={'corpus': corpus},
+                              data={'formtype': 'simple'})
+        r.raise_for_status()
+
+        # Parse response: keep the directory when found, stop at the number
+        path = number = None
+        for line in r.text.split('\n'):
+            title = re.search(r'<title>Index of (.*?)</title>', line)
+            tar = re.search(r'TAR([0-9]+)\.tgz', line)
+            if title:
+                path = host + title.group(1)
+            elif tar:
+                number = tar.group(1)
+                break
+        if path is None or number is None:
+            raise RuntimeError('lmtool response lists no language files')
+
+        self._download_file(path + '/' + number + '.lm', self.config.lang_file)
+        self._download_file(path + '/' + number + '.dic', self.config.dic_file)
+
+    def save_language_hash(self):
+        """Save self.new_hash (from language_has_changed) to the hash file"""
+        with open(self.config.hash_file, 'w') as f:
+            json.dump({'language': self.new_hash}, f)
+
+    def _download_file(self, url, path):
+        """Save url to path; raise requests.HTTPError on an error status"""
+        r = requests.get(url, stream=True)
+        r.raise_for_status()
+        with open(path, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)