Determining the worst-case and typical-case
As we know, fibonacci sequence are the sum of the previous two, knowing at first that Fo = 0 and F1 = 1, this sequence produces a tree that are the most unbalanced, because if we know that huffman are base in the last two frequencies which produce another node that are the sum of both, so, if we have a fibonacci sequence as a frequency of the letters that we would like to test, we can have this type of tree.
For example, let say that we have these case, when we have only 7 letter, just for know how works.
1, 1, 2, 3
a, b, cc, ddd
Alphabet and probability:
a : 0.142857 = 1/7
b : 0.142857 = 1/7
c : 0.285714 = 2/7
d : 0.428571 = 3/7
Tree:
[1] d
[] cabd
[1] b
[1] ab
[0] a
[0] cab
[0] c
The same:
cabd
/ \
d cab
/ \
c ab
/ \
a b
We can see that the tree have only new nodes just for one side, because we made the tree having the sum of the probabilities of frequency, and have a pattern that builds the tree in that way, also we can say that if the have in all the probabilities the same number, would have the same pattern.
A typical case is when we have a frequencies that are not the same for all the text or that don't have the pattern of fibonacci, so I made an experiment for the type that I think is interesting, I made a different graphics that can shows that these type of cases are bad compare to the average case.
In this graphic, we can see the compress ratio between the average-case and the worst-case, we can see that clearly are much difference between them, having a difference about 62% more compress than the strings that have a fibonacci sequence.
Program
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections | |
import string | |
import random | |
import time | |
class Hoja: | |
def __init__(self, texto, prob, bit0=None, bit1=None): | |
self.bit0 = bit0 | |
self.bit1 = bit1 | |
self.texto = texto | |
self.prob = prob | |
def genera_str(tam): | |
return ''.join(random.choice(string.ascii_letters) for x in range(tam)) | |
def genera_str_alfa(tam, str): | |
return ''.join(random.choice(str) for x in range(tam)) | |
def mostrar(arbol, nivel, bit=""): | |
if arbol == None: return | |
mostrar(arbol.bit1, nivel + 1, 1) | |
print(" " * nivel + "[" + str(bit) + "] " + str(arbol.texto)) | |
mostrar(arbol.bit0, nivel + 1, 0) | |
def obtener_alfabeto(cadena): | |
dic = collections.defaultdict(int) | |
for letra in cadena: | |
dic[letra] += 1 | |
alfa = [] | |
total = len(cadena) | |
#print ("Alfabeto & prob:") | |
#print cadena | |
cod = "" | |
for letra in sorted(dic, key = dic.get): | |
prob = float(dic[letra])/float(total) | |
#print("%s : %f" % (letra, dic[letra])) | |
alfa.append((letra, prob, cod)) | |
#time.sleep(3) | |
return alfa | |
def mapa_codigos(arbol, alfa={}, lista=[]): | |
if not arbol.bit1 and not arbol.bit0: | |
alfa[arbol.texto] = "".join(lista) | |
return | |
lista.append("1") | |
mapa_codigos(arbol.bit1, alfa, lista) | |
lista.pop() | |
lista.append("0") | |
mapa_codigos(arbol.bit0, alfa, lista) | |
lista.pop() | |
def codificacion(cadena): | |
alfa = obtener_alfabeto(cadena) | |
mapeo = {} | |
nodos = [] | |
for i in range(len(alfa)): | |
hijo = Hoja(alfa[i][0], alfa[i][1]) | |
nodos.append((hijo.texto, hijo.prob, hijo)) | |
while (len(nodos) != 0): | |
bit0 = nodos.pop(0) | |
try: | |
bit1 = nodos.pop(0) | |
except IndexError: | |
break | |
texto_unido = str(bit0[0]) + str(bit1[0]) | |
suma = float(bit0[1]) + float(bit1[1]) | |
hoja = Hoja(texto_unido, suma, bit0[2], bit1[2]) | |
for i in range(len(nodos)): | |
if nodos[i][1] > suma: | |
index = i | |
break | |
nodos = nodos[:i] + [(hoja.texto, hoja.prob, hoja)] + nodos[i:] | |
#print("Arbol: ") | |
#mostrar(hoja, 0) | |
mapa_codigos(hoja, mapeo) | |
#print("Mapeo:") | |
#print(mapeo) | |
return mapeo | |
#codificacion(string.printable) | |
def experimento_1(veces): | |
print "datos random bits_ran prob bits_prob" | |
x = 0 | |
for i in range(0, veces, 100000): | |
if i != 0 and x < 10: | |
str = (string.ascii_uppercase)*i | |
inicio = time.time() | |
mapa_1 = codificacion(str) | |
fin = time.time() | |
tiempo_prob = fin - inicio | |
inv_mapa_1 = {v:k for k, v in mapa_1.items()} | |
mayor_1 = len(max(inv_mapa_1, key=len)) | |
str_ran = genera_str(len(str)) | |
inicio = time.time() | |
mapa_2 = codificacion(str_ran) | |
fin = time.time() | |
tiempo_ran = fin - inicio | |
inv_mapa_2 = {v:k for k, v in mapa_2.items()} | |
mayor_2 = len(max(inv_mapa_2, key=len)) | |
print "%s %s %s %s %s" % (len(str), tiempo_ran, mayor_2, tiempo_prob, mayor_1) | |
x = x + 1 | |
def experimento_2(): | |
x = 0 | |
str = string.ascii_letters | |
str_fibo = "a" | |
b = 0 | |
c = 1 | |
#print "tam long tiempo_ran mayor_ran suma_ran compress_ran tiempo_fibo mayor_fibo suma_fibo compress_fibo" | |
for i in range(len(str)): | |
a = b + c | |
c = b | |
b = a | |
if x != 0: | |
str_fibo = str_fibo + str[x]*a | |
#mixta = "".join(random.sample(str_fibo, len(str_fibo))) | |
inicio = time.time() | |
#print "Fibo:" | |
mapa_1 = codificacion(str_fibo) | |
fin = time.time() | |
tiempo = fin - inicio | |
inv_mapa_1 = {v:k for k, v in mapa_1.items()} | |
mayor_1 = len(max(inv_mapa_1, key=len)) | |
suma_1 = sum(len(x) for x in inv_mapa_1) | |
longitud = len(inv_mapa_1)*7 | |
alfa = str[:x+1] | |
#print "Random:" | |
str_ran = genera_str_alfa(len(str_fibo), alfa) | |
inicio = time.time() | |
mapa_2 = codificacion(str_ran) | |
fin = time.time() | |
tiempo_ran = fin - inicio | |
inv_mapa_2 = {v:k for k, v in mapa_2.items()} | |
mayor_2 = len(max(inv_mapa_2, key=len)) | |
suma_2 = sum(len(x) for x in inv_mapa_2) | |
compress_bit_ran = float(suma_2)/float(longitud) | |
compress_bit_fibo = float(suma_1)/float(longitud) | |
print "%s %s %s %s %s %s %s %s %s %s" % (len(str_ran), longitud, tiempo_ran, mayor_2, suma_2, compress_bit_ran, tiempo, mayor_1, suma_1, compress_bit_fibo) | |
x = x + 1 | |
experimento_2() |
Está chido lo de Fibonacci. Para lo de caso típico, textos de verdad tendrían sentido. 7+7 pts.
ResponderEliminar