diff options
author | Clayton G. Hobbs <clay@lakeserv.net> | 2015-12-27 17:00:45 -0500 |
---|---|---|
committer | Clayton G. Hobbs <clay@lakeserv.net> | 2015-12-27 17:00:45 -0500 |
commit | e4b693b2061a0e3d93feba4fa570df7424bbe0d4 (patch) | |
tree | d436cb5242d33971fe37949c83583ad4a9e15fc6 | |
parent | c5578954ed54a8569014105fd75aa5fe07ba1c89 (diff) |
Rewrote language_updater.sh in Python
At the same time, I moved the logic that checks whether the language needs to be updated into the new LanguageUpdater class. The README has been updated to reflect the fact that you never need to create or download the language model and dictionary manually any more.
-rw-r--r-- | README.md | 28 | ||||
-rwxr-xr-x | blather.py | 34 | ||||
-rwxr-xr-x | language_updater.sh | 32 | ||||
-rw-r--r-- | languageupdater.py | 83 |
4 files changed, 94 insertions, 83 deletions
diff --git a/README.md b/README.md index b2949b6..22729a0 100644 --- a/README.md +++ b/README.md @@ -17,35 +17,21 @@ but adds a lot of features that go beyond the original purpose of Blather. ## Usage -1. Move commands.tmp to ~/.config/blather/commands.conf and fill the file with +1. Move commands.tmp to ~/.config/kaylee/commands.conf and fill the file with sentences and command to run -2. Run blather.py, this will generate ~/.config/blather/sentences.corpus based -on sentences in the 'commands' file -3. Quit Kaylee (there is a good chance it will just segfault) -4. Go to <http://www.speech.cs.cmu.edu/tools/lmtool.html> and upload the -sentences.corpus file -5. Download the resulting XXXX.lm file to the ~/.config/blather/language -directory and rename to file to 'lm' -6. Download the resulting XXXX.dic file to the ~/.config/blather/language -directory and rename to file to 'dic' -7. Run blather.py +2. Run blather.py. This will generate ~/.local/share/kaylee/sentences.corpus +based on sentences in the 'commands' file, then use +<http://www.speech.cs.cmu.edu/tools/lmtool.html> to create and save a new +language model and dictionary. * For GTK UI, run blather.py -i g * To start a UI in 'continuous' listen mode, use the -c flag * To use a microphone other than the system default, use the -m flag -8. Start talking +3. Start talking **Note:** to start Kaylee without needing to enter command line options all the -time, copy options.json.tmp to ~/.config/blather/options.json and edit +time, copy options.json.tmp to ~/.config/kaylee/options.json and edit accordingly. -### Bonus - -~~Once the sentences.corpus file has been created, run the language_updater.sh -script to automate the process of creating and downloading language files.~~ - -Kaylee now updates the language automatically. You should never need to run -language_updater.sh manually. 
- ### Examples * To run Kaylee with the GTK UI and start in continuous listen mode: diff --git a/blather.py b/blather.py index a90afe3..23802e8 100755 --- a/blather.py +++ b/blather.py @@ -16,6 +16,7 @@ import json from recognizer import Recognizer from config import Config +from languageupdater import LanguageUpdater class Blather: @@ -35,7 +36,7 @@ class Blather: # Read the commands self.read_commands() - if self.options['interface'] != None: + if self.options['interface']: if self.options['interface'] == "g": from gtkui import UI elif self.options['interface'] == "gt": @@ -59,7 +60,8 @@ class Blather: self.history = [] # Update the language if necessary - self.update_language() + self.language_updater = LanguageUpdater(self.config) + self.language_updater.update_language_if_changed() # Create the recognizer self.recognizer = Recognizer(self.config) @@ -95,34 +97,6 @@ class Blather: # Close the file hfile.close() - def update_language(self): - """Update the language if its hash has changed""" - # Load the stored hash from the hash file - try: - with open(self.config.hash_file, 'r') as f: - hashes = json.load(f) - stored_hash = hashes['language'] - except (IOError, KeyError, TypeError): - # No stored hash - stored_hash = '' - - # Calculate the hash the language file has right now - hasher = hashlib.sha256() - with open(self.config.strings_file, 'rb') as sfile: - buf = sfile.read() - hasher.update(buf) - new_hash = hasher.hexdigest() - - # If the hashes differ - if stored_hash != new_hash: - # Update the language - # FIXME: Do this with Python, not Bash - self.run_command('./language_updater.sh') - # Store the new hash - new_hashes = {'language': new_hash} - with open(self.config.hash_file, 'w') as f: - json.dump(new_hashes, f) - def run_command(self, cmd): """Print the command, then run it""" print(cmd) diff --git a/language_updater.sh b/language_updater.sh deleted file mode 100755 index 5a2c232..0000000 --- a/language_updater.sh +++ /dev/null @@ -1,32 +0,0 @@ 
-#!/bin/bash - -blatherdir=~/.config/kaylee -blatherdatadir=~/.local/share/kaylee -blathercachedir=~/.cache/kaylee -sentences=$blatherdatadir/sentences.corpus -sourcefile=$blatherdir/commands.conf -tempfile=$blathercachedir/url.txt -lmtoolurl=http://www.speech.cs.cmu.edu/cgi-bin/tools/lmtool/run - -cd $blatherdir - -sed -f - $sourcefile > $sentences <<EOFcommands - /^$/d - /^#/d - s/\:.*$// -EOFcommands - -# upload corpus file, find the resulting dictionary file url -curl -L -F corpus=@"$sentences" -F formtype=simple $lmtoolurl \ - |grep -A 1 "base name" |grep http \ - | sed -e 's/^.*\="//' | sed -e 's/\.tgz.*$//' | sed -e 's/TAR//' > $tempfile - -# download the .dic and .lm files -curl -C - -O $(cat $tempfile).dic -curl -C - -O $(cat $tempfile).lm - -# mv em to the right name/place -mv *.dic $blatherdatadir/dic -mv *.lm $blatherdatadir/lm - -rm $tempfile diff --git a/languageupdater.py b/languageupdater.py new file mode 100644 index 0000000..a82e023 --- /dev/null +++ b/languageupdater.py @@ -0,0 +1,83 @@ +# This is part of Kaylee +# -- this code is licensed GPLv3 +# Copyright 2013 Jezra +# Copyright 2015 Clayton G. 
# Hobbs

import hashlib
import json
import re


class LanguageUpdater:
    """Regenerate the language model when the sentence corpus changes.

    The corpus file (``config.strings_file``) is hashed with SHA-256 and
    compared against the hash stored in ``config.hash_file``.  When the two
    differ, the corpus is uploaded to the online lmtool and the resulting
    language model and dictionary are saved to ``config.lang_file`` and
    ``config.dic_file``.
    """

    # A leading greedy ``.*`` keeps the "last occurrence on the line wins"
    # behaviour of the patterns this class has always used.
    _DIRECTORY_RE = re.compile(r'.*<title>Index of (.*?)</title>')
    _NUMBER_RE = re.compile(r'.*TAR([0-9]+)\.tgz')

    def __init__(self, config):
        """Remember the configuration object that holds the file paths."""
        self.config = config

    def update_language_if_changed(self):
        """Test if the language has changed, and if it has, update it"""
        if self.language_has_changed():
            self.update_language()
            # Only reached when the update succeeded, so a failed update is
            # retried on the next run instead of being recorded as done.
            self.save_language_hash()

    def language_has_changed(self):
        """Use SHA256 hashes to test if the language has changed

        Sets ``self.stored_hash`` (``''`` when no usable hash is stored) and
        ``self.new_hash``, and returns True when they differ.
        """
        # Load the stored hash from the hash file
        try:
            with open(self.config.hash_file, 'r') as f:
                hashes = json.load(f)
            self.stored_hash = hashes['language']
        except (IOError, ValueError, KeyError, TypeError):
            # No stored hash.  ValueError covers an empty or corrupt hash
            # file, which json.load() reports as a decode error.
            self.stored_hash = ''

        # Calculate the hash the language file has right now
        with open(self.config.strings_file, 'rb') as sfile:
            self.new_hash = hashlib.sha256(sfile.read()).hexdigest()

        return self.new_hash != self.stored_hash

    def update_language(self):
        """Update the language using the online lmtool

        Raises RuntimeError when the server's answer cannot be used or a
        download fails; errors raised by requests itself are propagated.
        """
        # Imported here so that checking and saving hashes does not need
        # the third-party requests package.
        import requests

        print('Updating language using online lmtool')

        host = 'http://www.speech.cs.cmu.edu'
        url = host + '/cgi-bin/tools/lmtool/run'

        # Send corpus to the server; the corpus file is closed as soon as
        # the upload has finished.
        with open(self.config.strings_file, 'rb') as corpus:
            r = requests.post(url, files={'corpus': corpus},
                              data={'formtype': 'simple'})
        if r.status_code != 200:
            raise RuntimeError('lmtool request failed (HTTP %d)'
                               % r.status_code)

        lm_url, dic_url = self._find_model_urls(r.text, host)

        self._download_file(lm_url, self.config.lang_file)
        self._download_file(dic_url, self.config.dic_file)

    @classmethod
    def _find_model_urls(cls, text, host):
        """Return ``(lm_url, dic_url)`` parsed from an lmtool result page.

        ``text`` is the body of the response and ``host`` is prepended to
        the directory found in its ``<title>Index of ...</title>`` line.
        Raises RuntimeError when the directory or the ``TAR<number>.tgz``
        entry is missing.
        """
        path = None
        number = None
        for line in text.split('\n'):
            directory = cls._DIRECTORY_RE.search(line)
            if directory:
                # If we found the directory, keep it and don't break
                path = host + directory.group(1)
                continue
            archive = cls._NUMBER_RE.search(line)
            if archive:
                # If we found the number, keep it and break
                number = archive.group(1)
                break

        if path is None or number is None:
            raise RuntimeError('Unexpected response from lmtool: could not '
                               'find the generated language files')

        base = path + '/' + number
        return base + '.lm', base + '.dic'

    def save_language_hash(self):
        """Store ``self.new_hash`` in the hash file.

        ``self.new_hash`` is set by language_has_changed(), which must have
        been called first.
        """
        new_hashes = {'language': self.new_hash}
        with open(self.config.hash_file, 'w') as f:
            json.dump(new_hashes, f)

    def _download_file(self, url, path):
        """Download ``url`` and write it to ``path``.

        Raises RuntimeError when the server does not answer with HTTP 200,
        so that a failed download is not mistaken for a finished update.
        """
        import requests

        r = requests.get(url, stream=True)
        try:
            if r.status_code != 200:
                raise RuntimeError('Could not download %s (HTTP %d)'
                                   % (url, r.status_code))
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        finally:
            r.close()