Friday, July 13, 2012

Compressing Characters in Python

This is an in-place algorithm for compressing characters, e.g.
AABBCCCDDXYYZ --> A2B2C3D2XY2Z
The reason why the output isn't A2B2C3D2X1Y2Z1 is because adding 1 for only any non-repeated contagious character can make the compressed data bigger than the actual data, e.g.
ABCD --> A1B1C1D1 (A1B1C1D1 is larger than ABCD)
#!/usr/bin/env python

def compress(l):
    if len(l) == 0: return []
    counter = 1
    tmp = l[0]
    i = 1
    while i < len(l):
        if l[i] != tmp:
            new_index = update(i, counter, l)
            i = new_index + 1
            tmp = l[i]
            counter = 1
        else:
            counter += 1
        i += 1
    update(i, counter, l)
    return l

def update(current_index, counter, l):
    if counter == 1: return current_index - 1
    new_index = current_index - (counter - 1)
    l[new_index] = str(counter)
    shift(current_index, new_index+1, l)
    return new_index

def shift(source_index, destination_index, l):
    last_index = len(l)
    i = 0
    while (source_index + i) < last_index:
        l[destination_index+i] = l[source_index+i]
        i += 1
    last_index = destination_index + i
    del l[last_index:]

if __name__ == "__main__":
    assert (['A', '4', 
            'B', '2', 
            'C', 
            'D', '3', 
            'E', '2', 
            'A', '2', 
            'X', 
            'Y', '5', 
            'Z'] 
            == 
            compress(["A", "A", "A", "A",
                      "B", "B",
                      "C",
                      "D", "D", "D",
                      "E", "E",
                      "A", "A",
                      "X",
                      "Y", "Y", "Y", "Y", "Y",
                      "Z"]))

No comments:

Post a Comment