4 files changed, 94 insertions, 83 deletions
diff --git a/README.md b/README.md
index b2949b6..22729a0 100644
--- a/README.md
+++ b/README.md
@@ -17,35 +17,21 @@ but adds a lot of features that go beyond the original purpose of Blather.
 
 ## Usage
 
-1. Move commands.tmp to ~/.config/blather/commands.conf and fill the file with
+1. Move commands.tmp to ~/.config/kaylee/commands.conf and fill the file with
 sentences and command to run
-2. Run blather.py, this will generate ~/.config/blather/sentences.corpus based
-on sentences in the 'commands' file
-3. Quit Kaylee (there is a good chance it will just segfault)
-4. Go to <http://www.speech.cs.cmu.edu/tools/lmtool.html> and upload the
-sentences.corpus file
-5. Download the resulting XXXX.lm file to the ~/.config/blather/language
-directory and rename to file to 'lm'
-6. Download the resulting XXXX.dic file to the ~/.config/blather/language
-directory and rename to file to 'dic'
-7. Run blather.py
+2. Run blather.py.  This will generate ~/.local/share/kaylee/sentences.corpus
+based on sentences in the 'commands' file, then use
+<http://www.speech.cs.cmu.edu/tools/lmtool.html> to create and save a new
+language model and dictionary.
     * For GTK UI, run blather.py -i g
     * To start a UI in 'continuous' listen mode, use the -c flag
     * To use a microphone other than the system default, use the -m flag
-8. Start talking
+3. Start talking
 
 **Note:** to start Kaylee without needing to enter command line options all the
-time, copy options.json.tmp to ~/.config/blather/options.json and edit
+time, copy options.json.tmp to ~/.config/kaylee/options.json and edit
 accordingly.
 
-### Bonus
-
-~~Once the sentences.corpus file has been created, run the language_updater.sh
-script to automate the process of creating and downloading language files.~~
-
-Kaylee now updates the language automatically.  You should never need to run
-language_updater.sh manually.
-
 ### Examples
 
 * To run Kaylee with the GTK UI and start in continuous listen mode:
diff --git a/blather.py b/blather.py
index a90afe3..23802e8 100755
--- a/blather.py
+++ b/blather.py
@@ -16,6 +16,7 @@ import json
 
 from recognizer import Recognizer
 from config import Config
+from languageupdater import LanguageUpdater
 
 
 class Blather:
@@ -35,7 +36,7 @@ class Blather:
         # Read the commands
         self.read_commands()
 
-        if self.options['interface'] != None:
+        if self.options['interface']:
             if self.options['interface'] == "g":
                 from gtkui import UI
             elif self.options['interface'] == "gt":
@@ -59,7 +60,8 @@ class Blather:
             self.history = []
 
         # Update the language if necessary
-        self.update_language()
+        self.language_updater = LanguageUpdater(self.config)
+        self.language_updater.update_language_if_changed()
 
         # Create the recognizer
         self.recognizer = Recognizer(self.config)
@@ -95,34 +97,6 @@ class Blather:
             # Close the file
             hfile.close()
 
-    def update_language(self):
-        """Update the language if its hash has changed"""
-        # Load the stored hash from the hash file
-        try:
-            with open(self.config.hash_file, 'r') as f:
-                hashes = json.load(f)
-            stored_hash = hashes['language']
-        except (IOError, KeyError, TypeError):
-            # No stored hash
-            stored_hash = ''
-
-        # Calculate the hash the language file has right now
-        hasher = hashlib.sha256()
-        with open(self.config.strings_file, 'rb') as sfile:
-            buf = sfile.read()
-            hasher.update(buf)
-        new_hash = hasher.hexdigest()
-
-        # If the hashes differ
-        if stored_hash != new_hash:
-            # Update the language
-            # FIXME: Do this with Python, not Bash
-            self.run_command('./language_updater.sh')
-            # Store the new hash
-            new_hashes = {'language': new_hash}
-            with open(self.config.hash_file, 'w') as f:
-                json.dump(new_hashes, f)
-
     def run_command(self, cmd):
         """Print the command, then run it"""
         print(cmd)
diff --git a/language_updater.sh b/language_updater.sh
deleted file mode 100755
index 5a2c232..0000000
--- a/language_updater.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-blatherdir=~/.config/kaylee
-blatherdatadir=~/.local/share/kaylee
-blathercachedir=~/.cache/kaylee
-sentences=$blatherdatadir/sentences.corpus
-sourcefile=$blatherdir/commands.conf
-tempfile=$blathercachedir/url.txt
-lmtoolurl=http://www.speech.cs.cmu.edu/cgi-bin/tools/lmtool/run
-
-cd $blatherdir
-
-sed -f - $sourcefile > $sentences <<EOFcommands
-  /^$/d
-  /^#/d
-  s/\:.*$//
-EOFcommands
-
-# upload corpus file, find the resulting dictionary file url
-curl -L -F corpus=@"$sentences" -F formtype=simple $lmtoolurl \
-  |grep -A 1 "base name" |grep http \
-  | sed -e 's/^.*\="//' | sed -e 's/\.tgz.*$//' | sed -e 's/TAR//' > $tempfile
-
-# download the .dic and .lm files
-curl -C - -O $(cat $tempfile).dic
-curl -C - -O $(cat $tempfile).lm
-
-# mv em to the right name/place
-mv *.dic $blatherdatadir/dic
-mv *.lm $blatherdatadir/lm
-
-rm $tempfile
diff --git a/languageupdater.py b/languageupdater.py
new file mode 100644
index 0000000..a82e023
--- /dev/null
+++ b/languageupdater.py
@@ -0,0 +1,126 @@
+# This is part of Kaylee
+# -- this code is licensed GPLv3
+# Copyright 2013 Jezra
+# Copyright 2015 Clayton G. Hobbs
+
+import hashlib
+import json
+import re
+
+import requests
+
+
+class LanguageUpdater:
+    """Keep the language model and dictionary in sync with the strings file.
+
+    ``config`` must provide the paths ``hash_file``, ``strings_file``,
+    ``lang_file`` and ``dic_file``.
+    """
+
+    # Patterns used to find the result directory and the file number in the
+    # HTML page returned by lmtool
+    _DIR_RE = re.compile(r'.*<title>Index of (.*?)</title>.*')
+    _NUM_RE = re.compile(r'.*TAR([0-9]*?)\.tgz.*')
+
+    def __init__(self, config):
+        self.config = config
+
+    def update_language_if_changed(self):
+        """Test if the language has changed, and if it has, update it
+
+        The new hash is saved only when the update succeeded, so a failed
+        update is attempted again on the next run.
+        """
+        if self.language_has_changed() and self.update_language():
+            self.save_language_hash()
+
+    def language_has_changed(self):
+        """Use SHA256 hashes to test if the language has changed
+
+        Sets self.stored_hash and self.new_hash as a side effect.
+        """
+        # Load the stored hash from the hash file
+        try:
+            with open(self.config.hash_file, 'r') as f:
+                hashes = json.load(f)
+            self.stored_hash = hashes['language']
+        except (IOError, KeyError, TypeError, ValueError):
+            # No stored hash, or the hash file is not valid JSON
+            self.stored_hash = ''
+
+        # Calculate the hash the language file has right now
+        hasher = hashlib.sha256()
+        with open(self.config.strings_file, 'rb') as sfile:
+            hasher.update(sfile.read())
+        self.new_hash = hasher.hexdigest()
+
+        return self.new_hash != self.stored_hash
+
+    def update_language(self):
+        """Update the language using the online lmtool
+
+        Returns True if both the language model and the dictionary were
+        downloaded, False otherwise.
+        """
+        print('Updating language using online lmtool')
+
+        host = 'http://www.speech.cs.cmu.edu'
+        url = host + '/cgi-bin/tools/lmtool/run'
+
+        # Send corpus to the server; the file is closed once it is uploaded
+        values = {'formtype': 'simple'}
+        with open(self.config.strings_file, 'rb') as corpus:
+            r = requests.post(url, files={'corpus': corpus}, data=values)
+
+        # Parse response to get URLs of the files we need
+        path = None
+        number = None
+        for line in r.text.split('\n'):
+            dir_match = self._DIR_RE.search(line)
+            if dir_match:
+                # If we found the directory, keep it and don't break
+                path = host + dir_match.group(1)
+                continue
+            num_match = self._NUM_RE.search(line)
+            if num_match:
+                # If we found the number, keep it and break
+                number = num_match.group(1)
+                break
+
+        if path is None or number is None:
+            # Without both parts the download URLs cannot be built
+            print('Could not find the language files in the lmtool response')
+            return False
+
+        lm_url = path + '/' + number + '.lm'
+        dic_url = path + '/' + number + '.dic'
+
+        # Attempt both downloads even if the first one fails
+        lm_ok = self._download_file(lm_url, self.config.lang_file)
+        dic_ok = self._download_file(dic_url, self.config.dic_file)
+        return lm_ok and dic_ok
+
+    def save_language_hash(self):
+        """Store the hash computed by language_has_changed in the hash file"""
+        new_hashes = {'language': self.new_hash}
+        with open(self.config.hash_file, 'w') as f:
+            json.dump(new_hashes, f)
+
+    def _download_file(self, url, path):
+        """Download url to path and return True if that succeeded
+
+        The destination is opened only after a 200 response, so an existing
+        file is left untouched when the server reports an error.
+        """
+        r = requests.get(url, stream=True)
+        try:
+            if r.status_code != 200:
+                print('Could not download ' + url)
+                return False
+            with open(path, 'wb') as f:
+                for chunk in r:
+                    f.write(chunk)
+            return True
+        finally:
+            # Release the streamed connection in every case
+            r.close()