diff options
-rw-r--r-- | README.md | 28 | ||||
-rwxr-xr-x | blather.py | 34 | ||||
-rwxr-xr-x | language_updater.sh | 32 | ||||
-rw-r--r-- | languageupdater.py | 83 |
4 files changed, 94 insertions, 83 deletions
diff --git a/README.md b/README.md index b2949b6..22729a0 100644 --- a/README.md +++ b/README.md @@ -17,35 +17,21 @@ but adds a lot of features that go beyond the original purpose of Blather. ## Usage -1. Move commands.tmp to ~/.config/blather/commands.conf and fill the file with +1. Move commands.tmp to ~/.config/kaylee/commands.conf and fill the file with sentences and command to run -2. Run blather.py, this will generate ~/.config/blather/sentences.corpus based -on sentences in the 'commands' file -3. Quit Kaylee (there is a good chance it will just segfault) -4. Go to <http://www.speech.cs.cmu.edu/tools/lmtool.html> and upload the -sentences.corpus file -5. Download the resulting XXXX.lm file to the ~/.config/blather/language -directory and rename to file to 'lm' -6. Download the resulting XXXX.dic file to the ~/.config/blather/language -directory and rename to file to 'dic' -7. Run blather.py +2. Run blather.py. This will generate ~/.local/share/kaylee/sentences.corpus +based on sentences in the 'commands' file, then use +<http://www.speech.cs.cmu.edu/tools/lmtool.html> to create and save a new +language model and dictionary. * For GTK UI, run blather.py -i g * To start a UI in 'continuous' listen mode, use the -c flag * To use a microphone other than the system default, use the -m flag -8. Start talking +3. Start talking **Note:** to start Kaylee without needing to enter command line options all the -time, copy options.json.tmp to ~/.config/blather/options.json and edit +time, copy options.json.tmp to ~/.config/kaylee/options.json and edit accordingly. -### Bonus - -~~Once the sentences.corpus file has been created, run the language_updater.sh -script to automate the process of creating and downloading language files.~~ - -Kaylee now updates the language automatically. You should never need to run -language_updater.sh manually. - ### Examples * To run Kaylee with the GTK UI and start in continuous listen mode: diff --git a/blather.py b/blather.py index a90afe3..23802e8 100755 --- a/blather.py +++ b/blather.py @@ -16,6 +16,7 @@ import json from recognizer import Recognizer from config import Config +from languageupdater import LanguageUpdater class Blather: @@ -35,7 +36,7 @@ class Blather: # Read the commands self.read_commands() - if self.options['interface'] != None: + if self.options['interface']: if self.options['interface'] == "g": from gtkui import UI elif self.options['interface'] == "gt": @@ -59,7 +60,8 @@ class Blather: self.history = [] # Update the language if necessary - self.update_language() + self.language_updater = LanguageUpdater(self.config) + self.language_updater.update_language_if_changed() # Create the recognizer self.recognizer = Recognizer(self.config) @@ -95,34 +97,6 @@ class Blather: # Close the file hfile.close() - def update_language(self): - """Update the language if its hash has changed""" - # Load the stored hash from the hash file - try: - with open(self.config.hash_file, 'r') as f: - hashes = json.load(f) - stored_hash = hashes['language'] - except (IOError, KeyError, TypeError): - # No stored hash - stored_hash = '' - - # Calculate the hash the language file has right now - hasher = hashlib.sha256() - with open(self.config.strings_file, 'rb') as sfile: - buf = sfile.read() - hasher.update(buf) - new_hash = hasher.hexdigest() - - # If the hashes differ - if stored_hash != new_hash: - # Update the language - # FIXME: Do this with Python, not Bash - self.run_command('./language_updater.sh') - # Store the new hash - new_hashes = {'language': new_hash} - with open(self.config.hash_file, 'w') as f: - json.dump(new_hashes, f) - def run_command(self, cmd): """Print the command, then run it""" print(cmd) diff --git a/language_updater.sh b/language_updater.sh deleted file mode 100755 index 5a2c232..0000000 --- a/language_updater.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -blatherdir=~/.config/kaylee -blatherdatadir=~/.local/share/kaylee -blathercachedir=~/.cache/kaylee -sentences=$blatherdatadir/sentences.corpus -sourcefile=$blatherdir/commands.conf -tempfile=$blathercachedir/url.txt -lmtoolurl=http://www.speech.cs.cmu.edu/cgi-bin/tools/lmtool/run - -cd $blatherdir - -sed -f - $sourcefile > $sentences <<EOFcommands - /^$/d - /^#/d - s/\:.*$// -EOFcommands - -# upload corpus file, find the resulting dictionary file url -curl -L -F corpus=@"$sentences" -F formtype=simple $lmtoolurl \ - |grep -A 1 "base name" |grep http \ - | sed -e 's/^.*\="//' | sed -e 's/\.tgz.*$//' | sed -e 's/TAR//' > $tempfile - -# download the .dic and .lm files -curl -C - -O $(cat $tempfile).dic -curl -C - -O $(cat $tempfile).lm - -# mv em to the right name/place -mv *.dic $blatherdatadir/dic -mv *.lm $blatherdatadir/lm - -rm $tempfile diff --git a/languageupdater.py b/languageupdater.py new file mode 100644 index 0000000..a82e023 --- /dev/null +++ b/languageupdater.py @@ -0,0 +1,83 @@ +# This is part of Kaylee +# -- this code is licensed GPLv3 +# Copyright 2013 Jezra +# Copyright 2015 Clayton G. Hobbs + +import hashlib +import json +import re + +import requests + +class LanguageUpdater: + + def __init__(self, config): + self.config = config + + def update_language_if_changed(self): + """Test if the language has changed, and if it has, update it""" + if self.language_has_changed(): + self.update_language() + self.save_language_hash() + + def language_has_changed(self): + """Use SHA256 hashes to test if the language has changed""" + # Load the stored hash from the hash file + try: + with open(self.config.hash_file, 'r') as f: + hashes = json.load(f) + self.stored_hash = hashes['language'] + except (IOError, KeyError, TypeError): + # No stored hash + self.stored_hash = '' + + # Calculate the hash the language file has right now + hasher = hashlib.sha256() + with open(self.config.strings_file, 'rb') as sfile: + buf = sfile.read() + hasher.update(buf) + self.new_hash = hasher.hexdigest() + + return self.new_hash != self.stored_hash + + def update_language(self): + """Update the language using the online lmtool""" + print('Updating language using online lmtool') + + host = 'http://www.speech.cs.cmu.edu' + url = host + '/cgi-bin/tools/lmtool/run' + + # Prepare request + files = {'corpus': open(self.config.strings_file, 'rb')} + values = {'formtype': 'simple'} + + # Send corpus to the server + r = requests.post(url, files=files, data=values) + + # Parse response to get URLs of the files we need + for line in r.text.split('\n'): + # If we found the directory, keep it and don't break + if re.search(r'.*<title>Index of (.*?)</title>.*', line): + path = host + re.sub(r'.*<title>Index of (.*?)</title>.*', r'\1', line) + # If we found the number, keep it and break + elif re.search(r'.*TAR[0-9]*?\.tgz.*', line): + number = re.sub(r'.*TAR([0-9]*?)\.tgz.*', r'\1', line) + break + + lm_url = path + '/' + number + '.lm' + dic_url = path + '/' + number + '.dic' + + self._download_file(lm_url, self.config.lang_file) + self._download_file(dic_url, self.config.dic_file) + + def save_language_hash(self): + new_hashes = {'language': self.new_hash} + with open(self.config.hash_file, 'w') as f: + json.dump(new_hashes, f) + + def _download_file(self, url, path): + r = requests.get(url, stream=True) + if r.status_code == 200: + with open(path, 'wb') as f: + for chunk in r: + f.write(chunk) |