Roberto Martínez: Extra points: Huffman coding

2. Table4.1 gives the relative frequencies,in English prose minus punctuation and blanks, ignoring capitalization, of the alphabetic characters a, b, . . . , z, estimated by examination of a large block of English prose, believed to be typical. This table is copied, with one small change, from [6, Appendix 1]. Find an optimal (with respect to average code word length) prefix-condition encoding scheme for S = {a, b, ... , z} if

(a) A={0,1};
(b) A={0,1,∗}.
(c) What are the lengths of the shortest fixed-length encoding schemes, resulting in uniquely decodable codes, for S, in cases (a) and (b), above?

For do this exercise I made a program that are pretty similar that my last homework where by doing the huffman tree we have the optimal prefix-condition, this is the code.

	class Hoja:
	def __init__(self, texto, prob, bit0=None, bit1=None):
	self.bit0 = bit0
	self.bit1 = bit1
	self.texto = texto
	self.prob = prob

	def mapa_codigos(arbol, alfa={}, lista=[]):
	if not arbol.bit1 and not arbol.bit0:
	alfa[arbol.texto] = "".join(lista)
	return
	lista.append("1")
	mapa_codigos(arbol.bit1, alfa, lista)
	lista.pop()
	lista.append("0")
	mapa_codigos(arbol.bit0, alfa, lista)
	lista.pop()

	def codificacion(cadena):
	mapeo = {}
	nodos = []
	alfa = [('e', 12.702, ''), ('t', 9.056, ''), ('a', 8.167, ''), ('o', 7.507, ''), ('i', 6.966, ''), ('n', 6.749, ''), ('s', 6.327, ''), ('h', 6.094, ''), ('r', 5.987, ''), ('d', 4.253, ''), ('l', 4.025, ''), ('c', 2.782, ''), ('u', 2.758, ''), ('m', 2.406, ''), ('w', 2.36, ''), ('f', 2.228, ''), ('g', 2.015, ''), ('y', 1.974, ''), ('p', 1.929, ''), ('b', 1.492, ''), ('v', 0.978, ''), ('k', 0.772, ''), ('j', 0.153, ''), ('x', 0.15, ''), ('q', 0.095, ''), ('z', 0.075, '')]

	for i in range(len(alfa)):
	hijo = Hoja(alfa[i][0], alfa[i][1])
	nodos.append((hijo.texto, hijo.prob, hijo))

	while (len(nodos) != 0):
	bit0 = nodos.pop(0)
	try:
	bit1 = nodos.pop(0)
	except IndexError:
	break
	texto_unido = str(bit0[0]) + str(bit1[0])
	suma = float(bit0[1]) + float(bit1[1])
	hoja = Hoja(texto_unido, suma, bit0[2], bit1[2])
	for i in range(len(nodos)):
	if nodos[i][1] > suma:
	index = i
	break
	nodos = nodos[:i] + [(hoja.texto, hoja.prob, hoja)] + nodos[i:]
	mapa_codigos(hoja, mapeo)
	print(mapeo)

view raw extra.py hosted with ❤ by GitHub

So we have like result for the question a:

'a': '0000',
'c': '010001',
'b': '0111000',
'e': '1',
'd': '00011',
'g': '0111011',
'f': '001101',
'i': '01101',
'h': '01010',
'k': '00110011',
'j': '0011001011',
'm': '001111',
'l': '00010',
'o': '01111',
'n': '01100',
'q': '0011001001',
'p': '0111001',
's': '01011',
'r': '01001',
'u': '010000',
't': '0010',
'w': '001110',
'v': '0011000',
'y': '0111010',
'x': '0011001010',
'z': '0011001000'

Then I modified the code to have another node, so in the alphabet we have a new character and also a new coding data.

	class Hoja:
	def __init__(self, texto, prob, bit0=None, bit1=None, bit_=None):
	self.bit0 = bit0
	self.bit1 = bit1
	self.bit_ = bit_
	self.texto = texto
	self.prob = prob

	def mapa_codigos(arbol, alfa={}, lista=[]):
	if not arbol.bit1 and not arbol.bit0:
	alfa[arbol.texto] = "".join(lista)
	return
	lista.append("1")
	mapa_codigos(arbol.bit1, alfa, lista)
	lista.pop()
	lista.append("0")
	mapa_codigos(arbol.bit0, alfa, lista)
	lista.pop()
	lista.append("*")
	mapa_codigos(arbol.bit_, alfa, lista)
	lista.pop()

	def codificacion(cadena):
	#alfa = obtener_alfabeto(cadena)
	mapeo = {}
	nodos = []
	alfa = [('e', 12.702, ''), ('t', 9.056, ''), ('a', 8.167, ''), ('o', 7.507, ''), ('i', 6.966, ''), ('n', 6.749, ''), ('s', 6.327, ''), ('h', 6.094, ''), ('r', 5.987, ''), ('d', 4.253, ''), ('l', 4.025, ''), ('c', 2.782, ''), ('u', 2.758, ''), ('m', 2.406, ''), ('w', 2.36, ''), ('f', 2.228, ''), ('g', 2.015, ''), ('y', 1.974, ''), ('p', 1.929, ''), ('b', 1.492, ''), ('v', 0.978, ''), ('k', 0.772, ''), ('j', 0.153, ''), ('x', 0.15, ''), ('q', 0.095, ''), ('z', 0.075, '')]

	for i in range(len(alfa)):
	hijo = Hoja(alfa[i][0], alfa[i][1])
	nodos.append((hijo.texto, hijo.prob, hijo))

	while (len(nodos) != 0):
	try:
	bit0 = nodos.pop(0)
	print "bit0"
	except IndexError:
	break

	try:
	bit1 = nodos.pop(0)
	print "bit1"
	except IndexError:
	break

	try:
	bit_ = nodos.pop(0)
	print "bit_"
	except IndexError:
	break

	texto_unido = str(bit0[0]) + str(bit1[0]) + str(bit_[0])
	suma = float(bit0[1]) + float(bit1[1]) + float(bit_[1])
	hoja = Hoja(texto_unido, suma, bit0[2], bit1[2], bit_[2])
	for i in range(len(nodos)):
	if nodos[i][1] > suma:
	index = i
	break
	nodos = nodos[:i] + [(hoja.texto, hoja.prob, hoja)] + nodos[i:]
	mapa_codigos(hoja, mapeo)
	print(mapeo)
	return mapeo

view raw hoja.py hosted with ❤ by GitHub

'a': '11',
'c': '*01',
'b': '*0**',
'e': '1'
'd': '*11',
'g': '*1**',
'f': '0*0',
'i': '01',
'h': '**1',
'k': '*0*1*',
'j': '*0*10',
'm': '0**',
'l': '*10',
'o': '10',
'n': '00',
'q': '*0*111',
'p': '*1*0',
's': '***',
'r': '**0',
'u': '*00',
't': '1*',
'w': '0*1',
'v': '*0*0',
'y': '*1*1',
'x': '*0*11*',
'z': '*0*110'

For the next question we have that e are the last in the list and only have a number 1 as a bit of codification and for the question b also have e with one bit, but in general we have a code more shorter in the second one than in the first one.

Roberto Martínez

miércoles, 17 de abril de 2013

Extra points: Huffman coding

1 comentario: