Roberto Martínez: Huffman coding

For this week we need to implement the tree-based Huffman coding for Unicode strings, implementing the tree structure, then we need to do a report with a design of an experiment to determine the worst-case and typical-case complexity and compression ratio.

Determining the worst-case and typical-case

As we know, fibonacci sequence are the sum of the previous two, knowing at first that Fo = 0 and F1 = 1, this sequence produces a tree that are the most unbalanced, because if we know that huffman are base in the last two frequencies which produce another node that are the sum of both, so, if we have a fibonacci sequence as a frequency of the letters that we would like to test, we can have this type of tree.

For example, let say that we have these case, when we have only 7 letter, just for know how works.

1, 1, 2, 3
a, b, cc, ddd

Alphabet and probability:
a : 0.142857 = 1/7
b : 0.142857 = 1/7
c : 0.285714 = 2/7
d : 0.428571 = 3/7

Tree:

[1] d
[] cabd
[1] b
[1] ab
[0] a
[0] cab
[0] c

The same:

cabd

/ \
d cab
/ \
c ab
/ \
a b

We can see that the tree have only new nodes just for one side, because we made the tree having the sum of the probabilities of frequency, and have a pattern that builds the tree in that way, also we can say that if the have in all the probabilities the same number, would have the same pattern.

A typical case is when we have a frequencies that are not the same for all the text or that don't have the pattern of fibonacci, so I made an experiment for the type that I think is interesting, I made a different graphics that can shows that these type of cases are bad compare to the average case.

We have in this graphic the sum of bits that need to handle with a entire alphabet, using ascii, without compress the bits in the line yellow, the average using huffman compression and then using fibonacci sequence frequency and compression with huffman, with a sample of 35 strings with different size.

In this graphic, we can see the compress ratio between the average-case and the worst-case, we can see that clearly are much difference between them, having a difference about 62% more compress than the strings that have a fibonacci sequence.

Program

	import collections
	import string
	import random
	import time

	class Hoja:
	def __init__(self, texto, prob, bit0=None, bit1=None):
	self.bit0 = bit0
	self.bit1 = bit1
	self.texto = texto
	self.prob = prob

	def genera_str(tam):
	return ''.join(random.choice(string.ascii_letters) for x in range(tam))

	def genera_str_alfa(tam, str):
	return ''.join(random.choice(str) for x in range(tam))

	def mostrar(arbol, nivel, bit=""):
	if arbol == None: return
	mostrar(arbol.bit1, nivel + 1, 1)
	print(" " * nivel + "[" + str(bit) + "] " + str(arbol.texto))
	mostrar(arbol.bit0, nivel + 1, 0)

	def obtener_alfabeto(cadena):
	dic = collections.defaultdict(int)

	for letra in cadena:
	dic[letra] += 1

	alfa = []
	total = len(cadena)
	#print ("Alfabeto & prob:")
	#print cadena
	cod = ""
	for letra in sorted(dic, key = dic.get):
	prob = float(dic[letra])/float(total)
	#print("%s : %f" % (letra, dic[letra]))
	alfa.append((letra, prob, cod))
	#time.sleep(3)
	return alfa

	def mapa_codigos(arbol, alfa={}, lista=[]):
	if not arbol.bit1 and not arbol.bit0:
	alfa[arbol.texto] = "".join(lista)
	return
	lista.append("1")
	mapa_codigos(arbol.bit1, alfa, lista)
	lista.pop()
	lista.append("0")
	mapa_codigos(arbol.bit0, alfa, lista)
	lista.pop()

	def codificacion(cadena):
	alfa = obtener_alfabeto(cadena)
	mapeo = {}
	nodos = []

	for i in range(len(alfa)):
	hijo = Hoja(alfa[i][0], alfa[i][1])
	nodos.append((hijo.texto, hijo.prob, hijo))

	while (len(nodos) != 0):
	bit0 = nodos.pop(0)
	try:
	bit1 = nodos.pop(0)
	except IndexError:
	break
	texto_unido = str(bit0[0]) + str(bit1[0])
	suma = float(bit0[1]) + float(bit1[1])
	hoja = Hoja(texto_unido, suma, bit0[2], bit1[2])
	for i in range(len(nodos)):
	if nodos[i][1] > suma:
	index = i
	break
	nodos = nodos[:i] + [(hoja.texto, hoja.prob, hoja)] + nodos[i:]
	#print("Arbol: ")
	#mostrar(hoja, 0)
	mapa_codigos(hoja, mapeo)
	#print("Mapeo:")
	#print(mapeo)
	return mapeo

	#codificacion(string.printable)

	def experimento_1(veces):
	print "datos random bits_ran prob bits_prob"
	x = 0
	for i in range(0, veces, 100000):
	if i != 0 and x < 10:
	str = (string.ascii_uppercase)*i
	inicio = time.time()
	mapa_1 = codificacion(str)
	fin = time.time()
	tiempo_prob = fin - inicio
	inv_mapa_1 = {v:k for k, v in mapa_1.items()}
	mayor_1 = len(max(inv_mapa_1, key=len))

	str_ran = genera_str(len(str))
	inicio = time.time()
	mapa_2 = codificacion(str_ran)
	fin = time.time()
	tiempo_ran = fin - inicio
	inv_mapa_2 = {v:k for k, v in mapa_2.items()}
	mayor_2 = len(max(inv_mapa_2, key=len))
	print "%s %s %s %s %s" % (len(str), tiempo_ran, mayor_2, tiempo_prob, mayor_1)
	x = x + 1

	def experimento_2():
	x = 0
	str = string.ascii_letters
	str_fibo = "a"
	b = 0
	c = 1
	#print "tam long tiempo_ran mayor_ran suma_ran compress_ran tiempo_fibo mayor_fibo suma_fibo compress_fibo"
	for i in range(len(str)):
	a = b + c
	c = b
	b = a
	if x != 0:
	str_fibo = str_fibo + str[x]*a
	#mixta = "".join(random.sample(str_fibo, len(str_fibo)))
	inicio = time.time()
	#print "Fibo:"
	mapa_1 = codificacion(str_fibo)
	fin = time.time()
	tiempo = fin - inicio
	inv_mapa_1 = {v:k for k, v in mapa_1.items()}
	mayor_1 = len(max(inv_mapa_1, key=len))
	suma_1 = sum(len(x) for x in inv_mapa_1)
	longitud = len(inv_mapa_1)*7

	alfa = str[:x+1]
	#print "Random:"
	str_ran = genera_str_alfa(len(str_fibo), alfa)
	inicio = time.time()
	mapa_2 = codificacion(str_ran)
	fin = time.time()
	tiempo_ran = fin - inicio
	inv_mapa_2 = {v:k for k, v in mapa_2.items()}
	mayor_2 = len(max(inv_mapa_2, key=len))
	suma_2 = sum(len(x) for x in inv_mapa_2)

	compress_bit_ran = float(suma_2)/float(longitud)
	compress_bit_fibo = float(suma_1)/float(longitud)

	print "%s %s %s %s %s %s %s %s %s %s" % (len(str_ran), longitud, tiempo_ran, mayor_2, suma_2, compress_bit_ran, tiempo, mayor_1, suma_1, compress_bit_fibo)
	x = x + 1

	experimento_2()

view raw arbol.py hosted with ❤ by GitHub

Roberto Martínez

jueves, 11 de abril de 2013

Huffman coding

1 comentario: