Programming For Big Data

Programming For Big Data
Darren Redmond

Developing Out Spell Checking Capabilities
# Based on the tutorial below - and we'll see where we end up in class.
# Create a file called spellcheck.py with the following:

# to begin we start with a list of words in a file called spell.words
# we read the file and strip out the line endings
# (the with-block closes the file again; the list comprehension builds a real
# list, whereas map() on Python 3 only gives a one-shot iterator)
with open('spell.words') as word_file:
    words = [line.strip() for line in word_file]

# now check if the word zygotic is a word
print('zygotic' in words)

# Run it with: python spellcheck.py

Create Functions For Reuse
# create a function to load the words and to check a word is in the dictionary
def load_words(file_name):
    """Return the entries of file_name as a list, one stripped entry per line."""
    # the with-block closes the file; a list (unlike the lazy map object of
    # Python 3) can be searched as often as needed
    with open(file_name) as word_file:
        return [line.strip() for line in word_file]


def check_word(words, word):
    """Return True if word is contained in words."""
    return word in words


words = load_words('spell.words')

# now check if the word zygotic is a word
print(check_word(words, 'zygotic'))

# Run it with: python spellcheck.py

Create Function To Check A Sentence
# ... (earlier definitions are elided on this slide)
def check_words(words, sentence):
    """Return True if every word of sentence passes check_word.

    The first failing word is printed and False is returned straight away.
    """
    # split() without an argument ignores repeated, leading and trailing
    # whitespace, so an empty string is never reported as a misspelt word
    for word in sentence.split():
        if not check_word(words, word):
            print('Word is misspelt : ' + word)
            return False
    return True


words = load_words('spell.words')
print(check_word(words, 'zygotic'))
print(check_words(words, 'zygotic mistasdas elementary'))

# The Full Program Now
def load_words(file_name):
    """Return the entries of file_name as a list, one stripped entry per line."""
    # the with-block closes the file; returning a list matters because the
    # dictionary is searched several times below, and the map object of
    # Python 3 would already be used up after the first search
    with open(file_name) as word_file:
        return [line.strip() for line in word_file]


def check_word(words, word):
    """Return True if word is contained in words."""
    return word in words


def check_words(words, sentence):
    """Return True if every word of sentence passes check_word.

    The first failing word is printed and False is returned straight away.
    """
    # split() without an argument ignores repeated, leading and trailing
    # whitespace, so an empty string is never reported as a misspelt word
    for word in sentence.split():
        if not check_word(words, word):
            print('Word is misspelt : ' + word)
            return False
    return True


words = load_words('spell.words')
print(check_word(words, 'zygotic'))
print(check_words(words, 'zygotic mistasdas elementary'))

Create Spell Checker Class
class SpellChecker(object):
    """Checks single words and whole sentences against a loaded word list."""

    def __init__(self):
        # a set, so that each membership test is a hash lookup
        self.words = set()

    def load_words(self, file_name):
        """Replace the dictionary with the entries of file_name (one per line)."""
        # the with-block closes the file; the set is fully built here, so it
        # can be searched repeatedly (a Python 3 map object could not)
        with open(file_name) as word_file:
            self.words = set(line.strip() for line in word_file)

    def check_word(self, word):
        """Return True if word is in the loaded dictionary."""
        return word in self.words

    def check_words(self, sentence):
        """Return True if every word of sentence is in the dictionary.

        The first failing word is printed and False is returned straight away.
        """
        # split() without an argument ignores repeated, leading and trailing
        # whitespace, so an empty string is never reported as misspelt
        for word in sentence.split():
            if not self.check_word(word):
                print('Word is misspelt : ' + word)
                return False
        return True

Use the Spell Checker Class
# ... (the SpellChecker class is elided on this slide)

# enable so that this is only called when the script run from the command line
if __name__ == '__main__':
    spellChecker = SpellChecker()
    spellChecker.load_words('spell.words')

    # each result is printed before the next check runs, so the output order
    # (including the message printed inside check_words) stays the same
    single_word_result = spellChecker.check_word('zygotic')
    print(single_word_result)
    sentence_result = spellChecker.check_words('zygotic mistasdas elementary')
    print(sentence_result)

Let’s Create Some Unit Tests – spellcheck_test.py
import unittest

from spellcheck import SpellChecker


class TestSpellChecker(unittest.TestCase):
    """Unit tests for SpellChecker using the spell.words dictionary file."""

    def setUp(self):
        # every test starts from a freshly loaded checker
        self.spellChecker = SpellChecker()
        self.spellChecker.load_words('spell.words')

    def test_spell_checker(self):
        """A known word passes; a sentence passes only if all its words do."""
        checker = self.spellChecker
        self.assertTrue(checker.check_word('zygotic'))
        self.assertFalse(checker.check_words('zygotic mistasdas elementary'))
        self.assertTrue(checker.check_words('our first correct sentence'))


if __name__ == '__main__':
    unittest.main()

# Let's Find Some Bugs
import unittest

from spellcheck import SpellChecker


class TestSpellChecker(unittest.TestCase):
    """Unit tests for SpellChecker using the spell.words dictionary file."""

    def setUp(self):
        # every test starts from a freshly loaded checker
        self.spellChecker = SpellChecker()
        self.spellChecker.load_words('spell.words')

    def test_spell_checker(self):
        """Adds capitalised and full-stop sentences to the basic checks."""
        checker = self.spellChecker
        self.assertTrue(checker.check_word('zygotic'))
        self.assertFalse(checker.check_words('zygotic mistasdas elementary'))
        self.assertTrue(checker.check_words('our first correct sentence'))
        # handle case sensitivity
        self.assertTrue(checker.check_words('Our first correct sentence'))
        # handle full stop
        self.assertTrue(checker.check_words('Our first correct sentence.'))


if __name__ == '__main__':
    unittest.main()

# Let's Fix Some Bugs
class SpellChecker(object):
    # ... (the rest of the class is elided on this slide)

    def check_word(self, word):
        """Return True if word is in self.words, ignoring case and full stops at either end."""
        # remove full stops and ensure lower case - 2 bugs fixed.
        without_full_stops = word.strip('.')
        normalised = without_full_stops.lower()
        return normalised in self.words

Handle Multiple Words Failing in a Sentence
import unittest

from spellcheck import SpellChecker


class TestSpellChecker(unittest.TestCase):
    """Unit tests for SpellChecker using the spell.words dictionary file."""

    def setUp(self):
        # every test starts from a freshly loaded checker
        self.spellChecker = SpellChecker()
        self.spellChecker.load_words('spell.words')

    def test_spell_checker(self):
        """Adds a sentence containing two misspelt words to the checks."""
        checker = self.spellChecker
        self.assertTrue(checker.check_word('zygotic'))
        self.assertFalse(checker.check_words('zygotic mistasdas elementary'))
        self.assertTrue(checker.check_words('our first correct sentence'))
        # handle case sensitivity
        self.assertTrue(checker.check_words('Our first correct sentence'))
        # handle full stop
        self.assertTrue(checker.check_words('Our first correct sentence.'))
        self.assertFalse(checker.check_words('zygotic mistasdas spelllleeeing elementary'))

# ... (the remainder of the file is elided on this slide)

Handle Multiple Failures in a Sentence Better
class SpellChecker(object):
    # ... (the rest of the class is elided on this slide)

    def check_words(self, sentence):
        """Return the words of sentence that fail check_word, in sentence order.

        Each failing word is also printed. An empty list means no word failed.
        """
        failed_words = []
        # split() without an argument ignores repeated, leading and trailing
        # whitespace, so an empty string is never reported as misspelt
        for word in sentence.split():
            if not self.check_word(word):
                print('Word is misspelt : ' + word)
                failed_words.append(word)
        return failed_words

Handle Multiple Failures in a Sentence Better
class TestSpellChecker(unittest.TestCase):
    # ... (the rest of the class is elided on this slide)

    def test_spell_checker(self):
        """check_words now returns the list of failed words instead of a bool."""
        # assertEqual throughout: the assertEquals alias is deprecated and no
        # longer exists in Python 3.12
        self.assertTrue(self.spellChecker.check_word('zygotic'))
        failed_words = self.spellChecker.check_words('zygotic mistasdas elementary')
        self.assertEqual(1, len(failed_words))
        self.assertEqual('mistasdas', failed_words[0])
        self.assertEqual(0, len(self.spellChecker.check_words('our first correct sentence')))
        # handle case sensitivity
        self.assertEqual(0, len(self.spellChecker.check_words('Our first correct sentence')))
        # handle full stop
        self.assertEqual(0, len(self.spellChecker.check_words('Our first correct sentence.')))
        failed_words = self.spellChecker.check_words('zygotic mistasdas spelllleeeing elementary')
        self.assertEqual(2, len(failed_words))
        self.assertEqual('spelllleeeing', failed_words[1])

Handle a Document which is a list of Sentences
class SpellChecker(object):
    def load_file(self, file_name):
        """Return the lines of file_name as a list, each stripped of surrounding whitespace."""
        # the with-block closes the file; a list keeps the line order and can
        # be traversed repeatedly (a Python 3 map object could not)
        with open(file_name) as source:
            return [line.strip() for line in source]

    def load_words(self, file_name):
        """Load the dictionary from file_name (one word per line)."""
        # a set keeps every lookup cheap, which matters once a whole document
        # is checked word by word
        self.words = set(self.load_file(file_name))

    # ... (check_word and check_words are elided on this slide)

    def check_document(self, file_name):
        """Check every line of file_name and return all failures in document order.

        Each line is passed to check_words together with its 0-based index.
        """
        self.sentences = self.load_file(file_name)
        failed_words_in_sentences = []
        # enumerate supplies the line index without a hand-maintained counter
        for index, sentence in enumerate(self.sentences):
            failed_words_in_sentences.extend(self.check_words(sentence, index))
        return failed_words_in_sentences

Handle a Document which is a list of Sentences
class TestSpellChecker(unittest.TestCase):
    # ... (the rest of the class is elided on this slide)

    def test_spell_checker(self):
        """Adds a whole-document check: the dictionary is checked against itself."""
        # assertEqual throughout: the assertEquals alias is deprecated and no
        # longer exists in Python 3.12
        self.assertTrue(self.spellChecker.check_word('zygotic'))
        failed_words = self.spellChecker.check_words('zygotic mistasdas elementary')
        self.assertEqual(1, len(failed_words))
        self.assertEqual('mistasdas', failed_words[0])
        self.assertEqual(0, len(self.spellChecker.check_words('our first correct sentence')))
        # handle case sensitivity
        self.assertEqual(0, len(self.spellChecker.check_words('Our first correct sentence')))
        # handle full stop
        self.assertEqual(0, len(self.spellChecker.check_words('Our first correct sentence.')))
        failed_words = self.spellChecker.check_words('zygotic mistasdas spelllleeeing elementary')
        self.assertEqual(2, len(failed_words))
        self.assertEqual('spelllleeeing', failed_words[1])
        # more bugs because the spell checker doesn't spell check itself
        # correctly - 21 entries not correct - dictionary words need to be lower
        self.assertEqual(21, len(self.spellChecker.check_document('spell.words')))

Handle lowering the case of the dictionary words
class SpellChecker(object):
    def load_file(self, file_name):
        """Return the lines of file_name as a list, stripped and lower-cased."""
        # the with-block closes the file; a list can be traversed repeatedly
        # (a Python 3 map object could not)
        with open(file_name) as source:
            # ensures that all items read become lower case.
            return [line.strip().lower() for line in source]

    # ... (the rest of the class is elided on this slide)


class TestSpellChecker(unittest.TestCase):
    def test_spell_checker(self):
        # fix the 21 issues of words not matching the dictionary
        self.assertEqual(0, len(self.spellChecker.check_document('spell.words')))

Tracking failed words, line number and caret position
In order to report the misspelt word, the line number, and the caret position, we will append a dict to the list of failed words instead of just the word. So: failed_words.append(word) will become: failed_words.append({'word':word, 'line': line_number, 'pos': caret_position}) We just need to keep track of the lines and the caret positions. So in the check_document function we should use the enumerate function to give us the index in the list: for sentence in self.sentences: failed_words_in_sentences.extend(self.check_words(sentence)) becomes: for index, sentence in enumerate(self.sentences): failed_words_in_sentences.extend(self.check_words(sentence, index))

# So the check_words function will become:
def check_words(self, sentence, index):
    """Return one dict per misspelt word in sentence.

    index is the 0-based line number of sentence. Each dict holds the word
    under 'word', the 1-based line number under 'line' and the 1-based
    character position of the word within the sentence under 'pos'. Every
    failure is also printed.
    """
    caret_position = 0
    failed_words = []
    # split on a single space (not arbitrary whitespace) so that the caret
    # arithmetic below matches the real character positions
    for word in sentence.split(' '):
        # repeated, leading or trailing spaces produce empty tokens; they are
        # not words, so they are never reported - only the caret moves on
        if word and not self.check_word(word):
            line_number = index + 1
            position = caret_position + 1
            print('Word is misspelt ' + word + ' at line : ' + str(line_number) + ' pos ' + str(position))
            failed_words.append({'word': word, 'line': line_number, 'pos': position})
        # update the caret position to be the length of the word plus 1 for the split character.
        caret_position = caret_position + len(word) + 1
    return failed_words

# So the check_words function will become:
# index=0 is set here so that the function can be called for one line and index defaults to 0
def check_words(self, sentence, index=0):
    """Return one dict per misspelt word in sentence.

    index is the 0-based line number of sentence (0 when omitted). Each dict
    holds the word under 'word', the 1-based line number under 'line' and the
    1-based character position of the word within the sentence under 'pos'.
    Every failure is also printed.
    """
    caret_position = 0
    failed_words = []
    # split on a single space (not arbitrary whitespace) so that the caret
    # arithmetic below matches the real character positions
    for word in sentence.split(' '):
        # repeated, leading or trailing spaces produce empty tokens; they are
        # not words, so they are never reported - only the caret moves on
        if word and not self.check_word(word):
            line_number = index + 1
            position = caret_position + 1
            print('Word is misspelt ' + word + ' at line : ' + str(line_number) + ' pos ' + str(position))
            failed_words.append({'word': word, 'line': line_number, 'pos': position})
        # update the caret position to be the length of the word plus 1 for the split character.
        caret_position = caret_position + len(word) + 1
    return failed_words

Updating the tests So the test_spell_checker function will become:
def test_spell_checker(self):
    """Failures are now dicts, so word, line and caret position are all checked."""
    # assertEqual throughout: the assertEquals alias is deprecated and no
    # longer exists in Python 3.12
    self.assertTrue(self.spellChecker.check_word('zygotic'))
    failed_words = self.spellChecker.check_words('zygotic mistasdas elementary')
    self.assertEqual(1, len(failed_words))
    self.assertEqual('mistasdas', failed_words[0]['word'])
    self.assertEqual(1, failed_words[0]['line'])
    # 'zygotic' plus one space is 8 characters, so the next word starts at 9
    self.assertEqual(9, failed_words[0]['pos'])
    self.assertEqual(0, len(self.spellChecker.check_words('our first correct sentence')))
    # handle case sensitivity
    self.assertEqual(0, len(self.spellChecker.check_words('Our capital sentence')))
    # handle full stop
    self.assertEqual(0, len(self.spellChecker.check_words('Our full stop sentence.')))
    failed_words = self.spellChecker.check_words('zygotic mistasdas spelllleeeing elementary')
    self.assertEqual(2, len(failed_words))
    self.assertEqual('spelllleeeing', failed_words[1]['word'])
    self.assertEqual(1, failed_words[1]['line'])
    self.assertEqual(19, failed_words[1]['pos'])
    self.assertEqual(0, len(self.spellChecker.check_document('spell.words')))

Look at the files spellcheck.py and spellcheck_test.py What next?
The Full Solution Look at the files spellcheck.py and spellcheck_test.py What next? How to spell check a directory of files? – hint: os.listdir. How to handle different languages? – hint: a dict with a word list per language.

Programming For Big Data

Similar presentations

Presentation on theme: "Programming For Big Data"— Presentation transcript:

Similar presentations

About project

Feedback

Log in

Auth with social network:

Programming For Big Data

Similar presentations

Presentation on theme: "Programming For Big Data"— Presentation transcript:

Similar presentations

About project

Feedback