miércoles, 17 de abril de 2013

Extra points: Huffman coding

2. Table4.1 gives the relative frequencies,in English prose minus punctuation and blanks, ignoring capitalization, of the alphabetic characters a, b, . . . , z, estimated by examination of a large block of English prose, believed to be typical. This table is copied, with one small change, from [6, Appendix 1]. Find an optimal (with respect to average code word length) prefix-condition encoding scheme for S = {a, b, ... , z} if


(a) A={0,1};
(b) A={0,1,∗}.
(c) What are the lengths of the shortest fixed-length encoding schemes, resulting in uniquely decodable codes, for S, in cases (a) and (b), above?

For do this exercise I made a program that are pretty similar that my last homework where by doing the huffman tree we have the optimal prefix-condition, this is the code.


class Hoja:
def __init__(self, texto, prob, bit0=None, bit1=None):
self.bit0 = bit0
self.bit1 = bit1
self.texto = texto
self.prob = prob
def mapa_codigos(arbol, alfa={}, lista=[]):
if not arbol.bit1 and not arbol.bit0:
alfa[arbol.texto] = "".join(lista)
return
lista.append("1")
mapa_codigos(arbol.bit1, alfa, lista)
lista.pop()
lista.append("0")
mapa_codigos(arbol.bit0, alfa, lista)
lista.pop()
def codificacion(cadena):
mapeo = {}
nodos = []
alfa = [('e', 12.702, ''), ('t', 9.056, ''), ('a', 8.167, ''), ('o', 7.507, ''), ('i', 6.966, ''), ('n', 6.749, ''), ('s', 6.327, ''), ('h', 6.094, ''), ('r', 5.987, ''), ('d', 4.253, ''), ('l', 4.025, ''), ('c', 2.782, ''), ('u', 2.758, ''), ('m', 2.406, ''), ('w', 2.36, ''), ('f', 2.228, ''), ('g', 2.015, ''), ('y', 1.974, ''), ('p', 1.929, ''), ('b', 1.492, ''), ('v', 0.978, ''), ('k', 0.772, ''), ('j', 0.153, ''), ('x', 0.15, ''), ('q', 0.095, ''), ('z', 0.075, '')]
for i in range(len(alfa)):
hijo = Hoja(alfa[i][0], alfa[i][1])
nodos.append((hijo.texto, hijo.prob, hijo))
while (len(nodos) != 0):
bit0 = nodos.pop(0)
try:
bit1 = nodos.pop(0)
except IndexError:
break
texto_unido = str(bit0[0]) + str(bit1[0])
suma = float(bit0[1]) + float(bit1[1])
hoja = Hoja(texto_unido, suma, bit0[2], bit1[2])
for i in range(len(nodos)):
if nodos[i][1] > suma:
index = i
break
nodos = nodos[:i] + [(hoja.texto, hoja.prob, hoja)] + nodos[i:]
mapa_codigos(hoja, mapeo)
print(mapeo)
view raw extra.py hosted with ❤ by GitHub

So we have like result for the question a:

  • 'a': '0000', 
  • 'c': '010001', 
  • 'b': '0111000',
  •  'e': '1', 
  • 'd': '00011', 
  • 'g': '0111011',
  •  'f': '001101', 
  • 'i': '01101', 
  • 'h': '01010', 
  • 'k': '00110011', 
  • 'j': '0011001011',
  •  'm': '001111', 
  • 'l': '00010', 
  • 'o': '01111',
  • 'n': '01100', 
  • 'q': '0011001001', 
  • 'p': '0111001', 
  • 's': '01011',
  •  'r': '01001', 
  • 'u': '010000', 
  • 't': '0010', 
  • 'w': '001110',
  •  'v': '0011000', 
  • 'y': '0111010', 
  • 'x': '0011001010', 
  • 'z': '0011001000'
Then I modified the code to have another node, so in the alphabet we have a new character and also a new coding data.

class Hoja:
def __init__(self, texto, prob, bit0=None, bit1=None, bit_=None):
self.bit0 = bit0
self.bit1 = bit1
self.bit_ = bit_
self.texto = texto
self.prob = prob
def mapa_codigos(arbol, alfa={}, lista=[]):
if not arbol.bit1 and not arbol.bit0:
alfa[arbol.texto] = "".join(lista)
return
lista.append("1")
mapa_codigos(arbol.bit1, alfa, lista)
lista.pop()
lista.append("0")
mapa_codigos(arbol.bit0, alfa, lista)
lista.pop()
lista.append("*")
mapa_codigos(arbol.bit_, alfa, lista)
lista.pop()
def codificacion(cadena):
#alfa = obtener_alfabeto(cadena)
mapeo = {}
nodos = []
alfa = [('e', 12.702, ''), ('t', 9.056, ''), ('a', 8.167, ''), ('o', 7.507, ''), ('i', 6.966, ''), ('n', 6.749, ''), ('s', 6.327, ''), ('h', 6.094, ''), ('r', 5.987, ''), ('d', 4.253, ''), ('l', 4.025, ''), ('c', 2.782, ''), ('u', 2.758, ''), ('m', 2.406, ''), ('w', 2.36, ''), ('f', 2.228, ''), ('g', 2.015, ''), ('y', 1.974, ''), ('p', 1.929, ''), ('b', 1.492, ''), ('v', 0.978, ''), ('k', 0.772, ''), ('j', 0.153, ''), ('x', 0.15, ''), ('q', 0.095, ''), ('z', 0.075, '')]
for i in range(len(alfa)):
hijo = Hoja(alfa[i][0], alfa[i][1])
nodos.append((hijo.texto, hijo.prob, hijo))
while (len(nodos) != 0):
try:
bit0 = nodos.pop(0)
print "bit0"
except IndexError:
break
try:
bit1 = nodos.pop(0)
print "bit1"
except IndexError:
break
try:
bit_ = nodos.pop(0)
print "bit_"
except IndexError:
break
texto_unido = str(bit0[0]) + str(bit1[0]) + str(bit_[0])
suma = float(bit0[1]) + float(bit1[1]) + float(bit_[1])
hoja = Hoja(texto_unido, suma, bit0[2], bit1[2], bit_[2])
for i in range(len(nodos)):
if nodos[i][1] > suma:
index = i
break
nodos = nodos[:i] + [(hoja.texto, hoja.prob, hoja)] + nodos[i:]
mapa_codigos(hoja, mapeo)
print(mapeo)
return mapeo
view raw hoja.py hosted with ❤ by GitHub

  • 'a': '11', 
  • 'c': '*01',
  •  'b': '*0**', 
  • 'e': '1'
  • 'd': '*11', 
  • 'g': '*1**', 
  • 'f': '0*0',
  •  'i': '01', 
  • 'h': '**1', 
  • 'k': '*0*1*', 
  • 'j': '*0*10', 
  • 'm': '0**', 
  • 'l': '*10',
  •  'o': '10', 
  • 'n': '00', 
  • 'q': '*0*111', 
  • 'p': '*1*0', 
  • 's': '***', 
  • 'r': '**0', 
  • 'u': '*00', 
  • 't': '1*', 
  • 'w': '0*1', 
  • 'v': '*0*0', 
  • 'y': '*1*1', 
  • 'x': '*0*11*', 
  • 'z': '*0*110'

For the next question we have that e are the last in the list and only have a number 1 as a bit of codification and for the question b also have e with one bit, but in general we have a code more shorter in the second one than in the first one.

1 comentario: