summary refs log tree commit diff
diff options
context:
space:
mode:
author Clayton G. Hobbs <clay@lakeserv.net> 2015-12-27 17:00:45 -0500
committer Clayton G. Hobbs <clay@lakeserv.net> 2015-12-27 17:00:45 -0500
commit e4b693b2061a0e3d93feba4fa570df7424bbe0d4 (patch)
tree d436cb5242d33971fe37949c83583ad4a9e15fc6
parent c5578954ed54a8569014105fd75aa5fe07ba1c89 (diff)
Rewrote language_updater.sh in Python
At the same time, I moved the logic to check if the language should be
updated into the new LanguageUpdater class.  The README has been updated
to reflect the fact that you no longer need to do any of this manually
ever.
-rw-r--r-- README.md 28
-rwxr-xr-x blather.py 34
-rwxr-xr-x language_updater.sh 32
-rw-r--r-- languageupdater.py 83
4 files changed, 94 insertions, 83 deletions
diff --git a/README.md b/README.md
index b2949b6..22729a0 100644
--- a/README.md
+++ b/README.md
@@ -17,35 +17,21 @@ but adds a lot of features that go beyond the original purpose of Blather.
 
 ## Usage
 
-1. Move commands.tmp to ~/.config/blather/commands.conf and fill the file with
+1. Move commands.tmp to ~/.config/kaylee/commands.conf and fill the file with
 sentences and command to run
-2. Run blather.py, this will generate ~/.config/blather/sentences.corpus based
-on sentences in the 'commands' file
-3. Quit Kaylee (there is a good chance it will just segfault)
-4. Go to <http://www.speech.cs.cmu.edu/tools/lmtool.html> and upload the
-sentences.corpus file
-5. Download the resulting XXXX.lm file to the ~/.config/blather/language
-directory and rename to file to 'lm'
-6. Download the resulting XXXX.dic file to the ~/.config/blather/language
-directory and rename to file to 'dic'
-7. Run blather.py
+2. Run blather.py.  This will generate ~/.local/share/kaylee/sentences.corpus
+based on sentences in the 'commands' file, then use
+<http://www.speech.cs.cmu.edu/tools/lmtool.html> to create and save a new
+language model and dictionary.
     * For GTK UI, run blather.py -i g
     * To start a UI in 'continuous' listen mode, use the -c flag
     * To use a microphone other than the system default, use the -m flag
-8. Start talking
+3. Start talking
 
 **Note:** to start Kaylee without needing to enter command line options all the
-time, copy options.json.tmp to ~/.config/blather/options.json and edit
+time, copy options.json.tmp to ~/.config/kaylee/options.json and edit
 accordingly.
 
-### Bonus
-
-~~Once the sentences.corpus file has been created, run the language_updater.sh
-script to automate the process of creating and downloading language files.~~
-
-Kaylee now updates the language automatically.  You should never need to run
-language_updater.sh manually.
-
 ### Examples
 
 * To run Kaylee with the GTK UI and start in continuous listen mode:
diff --git a/blather.py b/blather.py
index a90afe3..23802e8 100755
--- a/blather.py
+++ b/blather.py
@@ -16,6 +16,7 @@ import json
 
 from recognizer import Recognizer
 from config import Config
+from languageupdater import LanguageUpdater
 
 
 class Blather:
@@ -35,7 +36,7 @@ class Blather:
         # Read the commands
         self.read_commands()
 
-        if self.options['interface'] != None:
+        if self.options['interface']:
             if self.options['interface'] == "g":
                 from gtkui import UI
             elif self.options['interface'] == "gt":
@@ -59,7 +60,8 @@ class Blather:
             self.history = []
 
         # Update the language if necessary
-        self.update_language()
+        self.language_updater = LanguageUpdater(self.config)
+        self.language_updater.update_language_if_changed()
 
         # Create the recognizer
         self.recognizer = Recognizer(self.config)
@@ -95,34 +97,6 @@ class Blather:
             # Close the file
             hfile.close()
 
-    def update_language(self):
-        """Update the language if its hash has changed"""
-        # Load the stored hash from the hash file
-        try:
-            with open(self.config.hash_file, 'r') as f:
-                hashes = json.load(f)
-            stored_hash = hashes['language']
-        except (IOError, KeyError, TypeError):
-            # No stored hash
-            stored_hash = ''
-
-        # Calculate the hash the language file has right now
-        hasher = hashlib.sha256()
-        with open(self.config.strings_file, 'rb') as sfile:
-            buf = sfile.read()
-            hasher.update(buf)
-        new_hash = hasher.hexdigest()
-
-        # If the hashes differ
-        if stored_hash != new_hash:
-            # Update the language
-            # FIXME: Do this with Python, not Bash
-            self.run_command('./language_updater.sh')
-            # Store the new hash
-            new_hashes = {'language': new_hash}
-            with open(self.config.hash_file, 'w') as f:
-                json.dump(new_hashes, f)
-
     def run_command(self, cmd):
         """Print the command, then run it"""
         print(cmd)
diff --git a/language_updater.sh b/language_updater.sh
deleted file mode 100755
index 5a2c232..0000000
--- a/language_updater.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-blatherdir=~/.config/kaylee
-blatherdatadir=~/.local/share/kaylee
-blathercachedir=~/.cache/kaylee
-sentences=$blatherdatadir/sentences.corpus
-sourcefile=$blatherdir/commands.conf
-tempfile=$blathercachedir/url.txt
-lmtoolurl=http://www.speech.cs.cmu.edu/cgi-bin/tools/lmtool/run
-
-cd $blatherdir
-
-sed -f - $sourcefile > $sentences <<EOFcommands
-  /^$/d
-  /^#/d
-  s/\:.*$//
-EOFcommands
-
-# upload corpus file, find the resulting dictionary file url
-curl -L -F corpus=@"$sentences" -F formtype=simple $lmtoolurl \
-  |grep -A 1 "base name" |grep http \
-  | sed -e 's/^.*\="//' | sed -e 's/\.tgz.*$//' | sed -e 's/TAR//' > $tempfile
-
-# download the .dic and .lm files
-curl -C - -O $(cat $tempfile).dic
-curl -C - -O $(cat $tempfile).lm
-
-# mv em to the right name/place
-mv *.dic $blatherdatadir/dic
-mv *.lm $blatherdatadir/lm
-
-rm $tempfile
diff --git a/languageupdater.py b/languageupdater.py
new file mode 100644
index 0000000..a82e023
--- /dev/null
+++ b/languageupdater.py
@@ -0,0 +1,83 @@
+# This is part of Kaylee
+# -- this code is licensed GPLv3
+# Copyright 2013 Jezra
+# Copyright 2015 Clayton G. Hobbs
+
+import hashlib
+import json
+import re
+
+import requests
+
+class LanguageUpdater:
+
+    def __init__(self, config):
+        self.config = config
+
+    def update_language_if_changed(self):
+        """Test if the language has changed, and if it has, update it"""
+        if self.language_has_changed():
+            self.update_language()
+            self.save_language_hash()
+
+    def language_has_changed(self):
+        """Use SHA256 hashes to test if the language has changed"""
+        # Load the stored hash from the hash file
+        try:
+            with open(self.config.hash_file, 'r') as f:
+                hashes = json.load(f)
+            self.stored_hash = hashes['language']
+        except (IOError, KeyError, TypeError):
+            # No stored hash
+            self.stored_hash = ''
+
+        # Calculate the hash the language file has right now
+        hasher = hashlib.sha256()
+        with open(self.config.strings_file, 'rb') as sfile:
+            buf = sfile.read()
+            hasher.update(buf)
+        self.new_hash = hasher.hexdigest()
+
+        return self.new_hash != self.stored_hash
+
+    def update_language(self):
+        """Update the language using the online lmtool"""
+        print('Updating language using online lmtool')
+
+        host = 'http://www.speech.cs.cmu.edu'
+        url = host + '/cgi-bin/tools/lmtool/run'
+
+        # Prepare request
+        files = {'corpus': open(self.config.strings_file, 'rb')}
+        values = {'formtype': 'simple'}
+
+        # Send corpus to the server
+        r = requests.post(url, files=files, data=values)
+
+        # Parse response to get URLs of the files we need
+        for line in r.text.split('\n'):
+            # If we found the directory, keep it and don't break
+            if re.search(r'.*<title>Index of (.*?)</title>.*', line):
+                path = host + re.sub(r'.*<title>Index of (.*?)</title>.*', r'\1', line)
+            # If we found the number, keep it and break
+            elif re.search(r'.*TAR[0-9]*?\.tgz.*', line):
+                number = re.sub(r'.*TAR([0-9]*?)\.tgz.*', r'\1', line)
+                break
+
+        lm_url = path + '/' + number + '.lm'
+        dic_url = path + '/' + number + '.dic'
+
+        self._download_file(lm_url, self.config.lang_file)
+        self._download_file(dic_url, self.config.dic_file)
+
+    def save_language_hash(self):
+        new_hashes = {'language': self.new_hash}
+        with open(self.config.hash_file, 'w') as f:
+            json.dump(new_hashes, f)
+
+    def _download_file(self, url, path):
+        r = requests.get(url, stream=True)
+        if r.status_code == 200:
+            with open(path, 'wb') as f:
+                for chunk in r:
+                    f.write(chunk)