diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..e3d98719e3789fa70bb830af677e3fb3a663b9ad --- /dev/null +++ b/src/Makefile @@ -0,0 +1,52 @@ +# This is the Makefile used for the sample solution. The list of files of +# object codes are likely different in the students solution and should +# be adapted. +.PHONY: clean exclean exercise test +CC?=clang +LD=${CC} +CFLAGS=-g -Wall -Werror -O3 -Wunused-parameter -Wpointer-arith +LDFLAGS=-g + +# only keep the files needed +CCOBJ=calc_compression.o huffman.o priorityqueue.o distribution.o \ + #inputsequence.o +HUFFOBJ=huffman-main.o codebook.o priorityqueue.o huffman.o \ + bitbuffer.o distribution.o #parse-fastq.o inputsequence.o +FQCOBJ=fastqcompress-mn.o parse-fastq.o inputsequence.o huffman.o \ + priorityqueue.o codebook.o + +# only keep the goal needed, so delete huffman.x and fastqcompress.x and +# add them back later +all: huffman.x #calc_compression.x #fastqcompress.x + +# generic compilation rule which creates dependency file on the fly +%.o: %.c + $(CC) -c $< -o $@ $(CFLAGS) -MT $@ -MMD -MP -MF $(@:.o=.d) + +# This is the programm which constructs the huffman code and outputs it +calc_compression.x: $(CCOBJ) + ${LD} $(CCOBJ) -o $@ ${LDFLAGS} + +# This is the programm which constructs the huffman code from files, +# applies it to the input and outputs it to a file with suffix .hf +# Using the option -d the .hf is decompressed and written to stdout +huffman.x: $(HUFFOBJ) + ${LD} $(HUFFOBJ) -o $@ ${LDFLAGS} + +# This is the program which parses fastq-files and outputs them +# in compressed form with different methods applied to the header +# and the sequences. + +fastqcompress.x: ${FQCOBJ} + ${LD} ${FQCOBJ} -o $@ ${LDFLAGS} + +.PHONY:test +test: + ./run_huffman.sh + +clean: + ${RM} *.[oxd] *~ *.hf *.hf.decode *.hf.decode.fast + ${RM} -r *.x.dSYM/ + +# read dependencies +-include $(wildcard *.d) diff --git a/src/README b/src/README new file mode 100644 index 0000000000000000000000000000000000000000..c77bddd4cbf63520523bdbf8676ae8dd92aa48f5 --- /dev/null +++ b/src/README @@ -0,0 +1,18 @@ +This directory contains material for the project on compressing short +read data: + +- shaks.data is for testing the calc_compression.x program +- shaks.data.result is the expected result of the calc_compression.x program + when applied to shaks.data +- huffman.h provides function headers and types for the interface of the + Huffman encoding +- priorityqueue.h provides function headers for the priorityqueue +- priorityqueue.c implements the the priorityqueue +- bitbuffer.h provides function header and types for a buffer to write/read + units of bits of variable size to/from a file +- bitbuffer.c.anfang provides the initial part of the file implementing the + bitbuffer. +- the dir fastq-files contains several files to test the compression program to + be developed +- Makefile provides goals to compile the sources. Needs to be adapted + to own needs diff --git a/src/bitbuffer.c b/src/bitbuffer.c new file mode 100644 index 0000000000000000000000000000000000000000..f05e74c75ac01da3b7cfac55f57ffc25674acd59 --- /dev/null +++ b/src/bitbuffer.c @@ -0,0 +1,108 @@ +#include <inttypes.h> +#include <assert.h> +#include <stdlib.h> +#include "bitbuffer.h" + +#define GT_BITSINBYTEBUFFER 64U + +struct GtBitbuffer { + unsigned int remainingbitsinbuf; /* The number of bits which currently + do not store any values */ + uint64_t currentbitbuffer, /* this stores the bits of the buffer */ + numberofallvalues, /* number of values already stored */ + readvalue; /* This is used for the bitbuffer in reading + mode. It is used for reading the next + 64 bits/8 bytes from the input stream. */ + FILE *fp; /* Either the output stream or the input + stream */ +}; + +GtBitbuffer *gt_bitbuffer_new(FILE *outfp) { + GtBitbuffer *bitbuffer = malloc(sizeof(*bitbuffer)); + assert(bitbuffer != NULL); + bitbuffer->remainingbitsinbuf = GT_BITSINBYTEBUFFER; + bitbuffer->currentbitbuffer = 0; + bitbuffer->numberofallvalues = 0; + bitbuffer->readvalue = 0; + bitbuffer->fp = outfp; + + return bitbuffer; +} + +void gt_bitbuffer_next_value(GtBitbuffer *bb, unsigned long value, + unsigned int bitsforvalue) { + // Fall 1: Code passt genau in den Buffer + if (bb->remainingbitsinbuf == bitsforvalue) { + bb->currentbitbuffer |= value; + fwrite(&bb->currentbitbuffer, sizeof(bb->currentbitbuffer), 1, bb->fp); + bb->remainingbitsinbuf = GT_BITSINBYTEBUFFER; + bb->currentbitbuffer = 0; + } + // Fall 2: Code passt in den Buffer, füllt ihn aber nicht komplett + else if (bb->remainingbitsinbuf > bitsforvalue) { + value = value << (bb->remainingbitsinbuf - bitsforvalue); + bb->currentbitbuffer |= value; + bb->remainingbitsinbuf -= bitsforvalue; + } + // Fall 3: Code passt nicht komplett in den Buffer + else if (bb->remainingbitsinbuf < bitsforvalue) { + unsigned long value_copy = value; + unsigned int bitsforvalue_remain; + + value = value >> (bitsforvalue - bb->remainingbitsinbuf); + bitsforvalue_remain = bitsforvalue - bb->remainingbitsinbuf; + bb->currentbitbuffer |= value; + fwrite(&bb->currentbitbuffer, sizeof(bb->currentbitbuffer), 1, bb->fp); + bb->remainingbitsinbuf = GT_BITSINBYTEBUFFER; + bb->currentbitbuffer = (value_copy + << (bb->remainingbitsinbuf - bitsforvalue_remain)); + bb->remainingbitsinbuf -= bitsforvalue_remain; + } + bb->numberofallvalues++; +} + +void gt_bitbuffer_delete(GtBitbuffer *bb) { + fwrite(&bb->currentbitbuffer, sizeof(bb->currentbitbuffer), 1, bb->fp); + free(bb); +} + +GtBitbuffer *gt_bitbuffer_new_get(FILE *infp) { + GtBitbuffer *bitbuffer = malloc(sizeof(*bitbuffer)); + assert(bitbuffer != NULL); + bitbuffer->remainingbitsinbuf = 0; + bitbuffer->currentbitbuffer = 0; + bitbuffer->readvalue = 0; + bitbuffer->numberofallvalues = 0; + bitbuffer->fp = infp; + + return bitbuffer; +} + +unsigned long gt_bitbuffer_next_bit(GtBitbuffer *bb) { + if (bb->remainingbitsinbuf == 0) { + if (fread(&bb->currentbitbuffer, sizeof(bb->currentbitbuffer), 1, + bb->fp) != 1) { + + } + bb->remainingbitsinbuf = GT_BITSINBYTEBUFFER; + } + + bb->readvalue = bb->currentbitbuffer >> (GT_BITSINBYTEBUFFER - 1); + + bb->currentbitbuffer = bb->currentbitbuffer << 1; + bb->remainingbitsinbuf--; + + return bb->readvalue; +} + +/* Read the next <bitsforvalue> bits from <bb>. */ +unsigned long gt_bitbuffer_next_value_get(GtBitbuffer *bb, + unsigned int bitsforvalue) { + unsigned long value = 0; + while (bitsforvalue > 0) { + value = value << 1; + value += gt_bitbuffer_next_bit(bb); + bitsforvalue--; + } + return value; +} diff --git a/src/bitbuffer.d b/src/bitbuffer.d new file mode 100644 index 0000000000000000000000000000000000000000..0a1578ce2bb114851898fecbde69b23e6a35b2f5 --- /dev/null +++ b/src/bitbuffer.d @@ -0,0 +1,3 @@ +bitbuffer.o: bitbuffer.c bitbuffer.h + +bitbuffer.h: diff --git a/src/bitbuffer.h b/src/bitbuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..62945d7aa249ee21a5256f0efbaeb8ff9e1aae3c --- /dev/null +++ b/src/bitbuffer.h @@ -0,0 +1,54 @@ +/* + Copyright (c) 2013 Stefan Kurtz <kurtz@zbh.uni-hamburg.de> + Copyright (c) 2013 Center for Bioinformatics, University of Hamburg + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#ifndef BITBUFFER_H +#define BITBUFFER_H + +#include <stdio.h> + +/* The <GtBitbuffer> class provides methods to sequentially write + bit-compressed integers to file as well as methods to read these + from a file.*/ +typedef struct GtBitbuffer GtBitbuffer; + +/* Creates a new <GtBitbuffer> for output to <outfp>. */ +GtBitbuffer* gt_bitbuffer_new(FILE *outfp); + +/* Appends <value> of <bitsforvalue> bits to <bb>. + <bitsforvalue> must not be larger than GT_BITSINBYTEBUFFER. */ +void gt_bitbuffer_next_value(GtBitbuffer *bb, unsigned long value, + unsigned int bitsforvalue); + +/* Append unary code of <value> to <bb>. The unary code consists of + <value> 0's followed by a single 1. <value> can be larger than + GT_BITSINBYTEBUFFER. */ +void gt_bitbuffer_next_value_unary_code (GtBitbuffer *bb, unsigned long value); + +/* Deletes <bb> and frees all associated memory. */ +void gt_bitbuffer_delete(GtBitbuffer *bb); + +/* Return number of bytes used for the bitbuffer */ +unsigned int gt_bitbuffer_bytes(void); + +/* Creates a new <GtBitbuffer> for input from <infp>. */ +GtBitbuffer *gt_bitbuffer_new_get(FILE *infp); + +/* Read the next <bitsforvalue> bits from <bb>. */ +unsigned long gt_bitbuffer_next_value_get (GtBitbuffer *bb, + unsigned int bitsforvalue); + +#endif diff --git a/src/bitbuffer.o b/src/bitbuffer.o new file mode 100644 index 0000000000000000000000000000000000000000..406f4cbc6316b58fc8d35de28b4824f3edc892e5 Binary files /dev/null and b/src/bitbuffer.o differ diff --git a/src/calc_compression.c b/src/calc_compression.c new file mode 100644 index 0000000000000000000000000000000000000000..52eaceb1a754957125b9cbc70269b5de33037a27 --- /dev/null +++ b/src/calc_compression.c @@ -0,0 +1,123 @@ +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include <ctype.h> +#include <assert.h> +#include <string.h> +#include <time.h> +#include "calc_compression.h" +#include "huffman.h" +#include "distribution.h" + +unsigned long file_size(const char *filename) { + long filesize; + FILE *file = fopen(filename, "rb"); + + assert (file != NULL); + + fseek(file, 0, SEEK_END); + + filesize = ftell(file); + fclose(file); + assert (filesize >= 0); + return (unsigned long) filesize; +} + +char *bitsequence_to_string(Bitsequence bs) { + unsigned int idx; + const unsigned int bits = CHAR_BIT * sizeof(Bitsequence); + Bitsequence mask; + char *buffer; + + buffer = malloc((bits + 1) * sizeof(*buffer)); + assert(buffer != NULL); + for (idx = 0, mask = ((Bitsequence) 1) << (bits - 1); + mask > 0; idx++, mask >>= 1) { + buffer[idx] = (bs & mask) ? '1' : '0'; + } + assert(idx == bits); + buffer[idx] = '\0'; + return buffer; +} + +void print_code(unsigned int length, + unsigned int code, + unsigned int symbolnum, + unsigned long count, + void *data) { + data = NULL; + + if (count > 0) { + if (isprint(symbolnum)) { + printf("%c\t%lu\t%d\t", symbolnum, count, length); + } else { + printf("\\%d\t%lu\t%d\t", symbolnum, count, length); + } + char *buffer; + + buffer = bitsequence_to_string((Bitsequence) code); + printf("%s\n", buffer + (strlen(buffer) - length)); + free(buffer); + } +} + +unsigned long get_bytes_for_bits(uint64_t bits) { + if (bits % 8 == 0) + return bits / 8; + else + return bits / 8 + 1; +} + +double get_compression_ratio(uint64_t compressed_size, + unsigned long file_size) { + if (file_size == 0) + return 0; + + return (double) compressed_size / (double) file_size; +} + +int main(int argc, char *argv[]) { +#undef TIME +#ifdef TIME + clock_t begin, end; + float runtime; + begin = clock(); +#endif + if (argc != 2) { + fprintf(stderr, "%s Invalid count of arguments\n", argv[0]); + return EXIT_FAILURE; + } + unsigned long dist[UCHAR_MAX + 1] = {0}; + + for (int i = 1; i < argc; i++) { + distribution_update(dist, argv[0], argv[i]); + } + Huffman_encoding *h_enc; + + h_enc = huffman_encoding_new(dist, UCHAR_MAX + 1); + printf("# symbol\tfreq\tcodelen\tcode\n"); + + huffman_encoding_traversal(h_enc, print_code, NULL); + + unsigned long fileSize = file_size(argv[1]); + uint64_t compressed_size = huffman_encoding_size(h_enc); + unsigned long compressed_bytes = get_bytes_for_bits(compressed_size); + double ratio = get_compression_ratio(compressed_size, fileSize); + + printf("# file\t%s\n", argv[1]); + printf("# filesize (bytes)\t%lu\n", fileSize); + printf("# compressed size (bits)\t%lu\n", compressed_size); + printf("# compressed size (bytes)\t%lu\n", compressed_bytes); + if (fileSize > 0) + printf("# compression ratio (bits/byte)\t%.2f\n", ratio); + + huffman_encoding_delete(h_enc); + +#ifdef TIME + end = clock(); + runtime = end-begin; + runtime /= CLOCKS_PER_SEC; + printf("Laufzeit: %f Sekunden\n", runtime); +#endif + exit(EXIT_SUCCESS); +} \ No newline at end of file diff --git a/src/calc_compression.h b/src/calc_compression.h new file mode 100644 index 0000000000000000000000000000000000000000..4c9b828e5d45ad9fc3d66b757b098a89dd189182 --- /dev/null +++ b/src/calc_compression.h @@ -0,0 +1,24 @@ +#ifndef MY_PFN_PROJEKT_2020_CALC_COMPRESSION_H +#define MY_PFN_PROJEKT_2020_CALC_COMPRESSION_H + +#include <stdio.h> +#include <stdint.h> + +unsigned long file_size(const char *filename); + +typedef unsigned int Bitsequence; + +char *bitsequence_to_string(Bitsequence bs); + +void print_code(unsigned int length, + unsigned int code, + unsigned int symbolnum, + unsigned long count, + void *data); + +unsigned long get_bytes_for_bits(uint64_t bits); + +double get_compression_ratio(uint64_t compressed_size, + unsigned long file_size); + +#endif //MY_PFN_PROJEKT_2020_CALC_COMPRESSION_H \ No newline at end of file diff --git a/src/codebook.c b/src/codebook.c new file mode 100644 index 0000000000000000000000000000000000000000..e3d5cf6175a88725d81af123f3dbf8c833e84ed3 --- /dev/null +++ b/src/codebook.c @@ -0,0 +1,30 @@ +#include <limits.h> +#include <stdlib.h> +#include <assert.h> +#include "codebook.h" + +void add_entry(unsigned int length, + unsigned int code, + unsigned int symbolnum, + unsigned long count, + void *data) { + Codebook_entry *codebook = (Codebook_entry *) data; + codebook[symbolnum].count = count; + codebook[symbolnum].code = code; + codebook[symbolnum].length = length; +} + +Codebook_entry *codebook_new(Huffman_encoding *h_enc) { + + Codebook_entry *codebook = calloc(UCHAR_MAX + 1, + sizeof(*codebook)); + + assert(codebook != NULL); + huffman_encoding_traversal(h_enc, add_entry, codebook); + + return codebook; +} + +void codebook_delete(Codebook_entry *codebook) { + free(codebook); +} diff --git a/src/codebook.d b/src/codebook.d new file mode 100644 index 0000000000000000000000000000000000000000..5797d4667042f2ccf2bf4d52679428cf0a2ad0aa --- /dev/null +++ b/src/codebook.d @@ -0,0 +1,5 @@ +codebook.o: codebook.c codebook.h huffman.h + +codebook.h: + +huffman.h: diff --git a/src/codebook.h b/src/codebook.h new file mode 100644 index 0000000000000000000000000000000000000000..8dd0d6ec9a29befbe14564c81e90159adc6125fe --- /dev/null +++ b/src/codebook.h @@ -0,0 +1,26 @@ +#ifndef MY_PFN_PROJEKT_2020_CODEBOOK_H +#define MY_PFN_PROJEKT_2020_CODEBOOK_H + +#include <inttypes.h> +#include <stdio.h> +#include "huffman.h" + +typedef struct Codebook_entry Codebook_entry; + +struct Codebook_entry { + unsigned long count; + unsigned int code; + unsigned int length; +}; + +void add_entry(unsigned int length, + unsigned int code, + unsigned int symbolnum, + unsigned long count, + void *data); + +Codebook_entry *codebook_new(Huffman_encoding *h_enc); + +void codebook_delete(Codebook_entry *codebook); + +#endif //MY_PFN_PROJEKT_2020_CODEBOOK_H diff --git a/src/codebook.o b/src/codebook.o new file mode 100644 index 0000000000000000000000000000000000000000..b30951fb2035236ca616c12b775a2d8ab36dda82 Binary files /dev/null and b/src/codebook.o differ diff --git a/src/distribution.c b/src/distribution.c new file mode 100644 index 0000000000000000000000000000000000000000..974c2d29a892b80c065e8eb5436086bc4a31ec6e --- /dev/null +++ b/src/distribution.c @@ -0,0 +1,94 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <ctype.h> +#include <string.h> +#include <assert.h> +#include "distribution.h" + +unsigned long distribution_update(unsigned long *dist, const char *progname, + const char *filename) { + unsigned int currentCharacter; + unsigned long filesize = 0; + FILE *file = fopen(filename, "rb"); + if (file == NULL) { + fprintf(stderr, "%s cannot open file (dist) %s \n", progname, filename); + exit(EXIT_FAILURE); + } + while ((currentCharacter = fgetc(file)) != EOF) { + dist[currentCharacter]++; + filesize++; + } + fclose(file); + + return filesize; +} + +/*void distribution_show(const unsigned long *dist) +{ + int i; + + for (i = 0; i <= UCHAR_MAX; i++) + { + if (dist[i] > 0) + { + if (iscntrl(i)) + printf("\\%d: %lu\n",i,dist[i]); + else + printf("%c: %lu\n",(char) i,dist[i]); + } + } +}*/ + +const unsigned long *binarysearch(const unsigned long *leftptr, + const unsigned long *rightptr, + unsigned long key) +{ + const unsigned long *midptr; + while (leftptr <= rightptr) + { + midptr = leftptr + (unsigned long) (rightptr-leftptr)/2; + if (key < *midptr) + { + rightptr = midptr - 1; + } else + { + if (key > *midptr) + { + leftptr = midptr + 1; + } else + if (key == *midptr) + { + while (*(midptr - 1) == key) { + midptr--; + } + return midptr; + } + } + } + if (key < *midptr) + return midptr; + return NULL; +} + +int main (void) { + + const unsigned long array1[] = {10, 13, 15, 15, 17, 18, 20, 22, 25, 27}; + const unsigned long array2[] = {10, 13, 15, 15, 17, 18, 18, 18, 20, 22}; + const unsigned long array3[] = {20, 22, 25, 27}; + + printf("Adresse: %lu\n", (unsigned long) &array2[5]); + + const unsigned long *element1 = binarysearch(&array1[0], &array1[9], 18); + const unsigned long *element2 = binarysearch(&array2[0], &array2[9], 18); + const unsigned long *element3 = binarysearch(&array3[0], &array3[3], 18); + + printf("element array 1: %lu\n", *element1); + printf("element array 2: %lu\nAdresse: %lu\n", *element2, (unsigned long) element2); + printf("element array 3: %lu\n", *element3); + + + return EXIT_SUCCESS; +} + + diff --git a/src/distribution.d b/src/distribution.d new file mode 100644 index 0000000000000000000000000000000000000000..981e9c402fdde94dcea55458e4ff17b02a5c6133 --- /dev/null +++ b/src/distribution.d @@ -0,0 +1,3 @@ +distribution.o: distribution.c distribution.h + +distribution.h: diff --git a/src/distribution.h b/src/distribution.h new file mode 100644 index 0000000000000000000000000000000000000000..75854c81a4f92df30b1dabd451446a9dd43492cd --- /dev/null +++ b/src/distribution.h @@ -0,0 +1,9 @@ +#ifndef MY_PFN_PROJEKT_2020_DISTRIBUTION_H +#define MY_PFN_PROJEKT_2020_DISTRIBUTION_H + +unsigned long distribution_update(unsigned long *dist, const char *progname, + const char *filename); + +/*void distribution_show(const unsigned long *dist);*/ + +#endif //MY_PFN_PROJEKT_2020_DISTRIBUTION_H \ No newline at end of file diff --git a/src/distribution.o b/src/distribution.o new file mode 100644 index 0000000000000000000000000000000000000000..8c02d0fc84c32dbe41ddfa6d61e63a5dfb100e88 Binary files /dev/null and b/src/distribution.o differ diff --git a/src/huffman-main.c b/src/huffman-main.c new file mode 100644 index 0000000000000000000000000000000000000000..c013d16527cca862986fcbf9fa156979896d2396 --- /dev/null +++ b/src/huffman-main.c @@ -0,0 +1,168 @@ +#include <stdio.h> +#include <limits.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <assert.h> +#include <unistd.h> +#include "distribution.h" +#include "huffman.h" +#include "codebook.h" +#include "bitbuffer.h" + +typedef struct { + bool decode; + char *filename; + char *progname; +} Options; + +static void usage(const char *progname, bool showOptions) { + fprintf(stderr, "Usage %s [options] <filename>\n", progname); + if (showOptions) { + fprintf(stderr, + "Encode the given file.\n" + "-d <filename.hf>\tDecode the given file.\n" + "-h\tshow this usage message.\n"); + } else { + fprintf(stderr, "Usage -h for more information.\n"); + } +} + +static Options *option_new(int argc, char *const *argv) { + bool error = false; + + if (argc < 2 || argc > 3) { + error = true; + } + Options *options = malloc(sizeof(*options)); + options->progname = argv[0]; + options->filename = argv[1]; + options->decode = false; + + int opt; + while ((opt = getopt(argc, argv, "dh?")) != -1) { + switch ((char) opt) { + default: + assert((char) opt == '?'); + break; + case 'd': + options->decode = true; + options->filename = argv[2]; + break; + case 'h': + usage(argv[0], true); + free(options); + return NULL; + break; + } + } + if (error) { + free(options); + usage(argv[0], false); + return NULL; + } + return options; +} + +char *create_filename(const char *file) { + const char *suffix = ".hf"; + const size_t buffer_length = strlen(file) + strlen(suffix) + 1; + char *buffer = malloc(buffer_length); + int n = sprintf(buffer, "%s%s", file, suffix); + + assert(n <= buffer_length); + return buffer; +} + +bool encode(const char *progname, const char *file1) { + bool error = false; + unsigned long dist[UCHAR_MAX + 1] = {0}; + unsigned long fileSize = distribution_update(dist, progname, file1); + Huffman_encoding *h_enc; + + h_enc = huffman_encoding_new(dist, UCHAR_MAX + 1); + Codebook_entry *codebook = codebook_new(h_enc); + + huffman_encoding_delete(h_enc); + + // Output Datei erstellen + char *filename = create_filename(file1); + FILE *outfp = fopen(filename, "wb"); + if (outfp == NULL) { + fprintf(stderr, "%s cannot create file %s \n", progname, filename); + error = true; + } + + GtBitbuffer *bitbuffer = gt_bitbuffer_new(outfp); + + gt_bitbuffer_next_value(bitbuffer, fileSize, 64); + + for (int i = 0; i < (UCHAR_MAX + 1); i++) { + gt_bitbuffer_next_value(bitbuffer, dist[i], 64); + } + + // Quelldatei nochmal einlesen + FILE *infp = fopen(file1, "rb"); + if (infp == NULL) { + fprintf(stderr, "%s cannot open file %s \n", progname, file1); + error = true; + } + + int currentCharacter; + while ((currentCharacter = fgetc(infp)) != EOF) { + gt_bitbuffer_next_value(bitbuffer, codebook[currentCharacter].code, + codebook[currentCharacter].length); + } + + codebook_delete(codebook); + gt_bitbuffer_delete(bitbuffer); + free(filename); + + fclose(infp); + fclose(outfp); + return error; +} + +bool decode(const char *progname, const char *file1) { + bool error = false; + FILE *file = fopen(file1, "rb"); + if (file == NULL) { + fprintf(stderr, "%s cannot open file %s \n", progname, file1); + error = true; + } + + GtBitbuffer *bitbuffer = gt_bitbuffer_new_get(file); + unsigned long filesize = gt_bitbuffer_next_value_get(bitbuffer, 64); + unsigned long dist[UCHAR_MAX + 1] = {0};; + + for (int i = 0; i < (UCHAR_MAX + 1); i++) { + dist[i] = gt_bitbuffer_next_value_get(bitbuffer, 64); + } + + Huffman_encoding *h_enc = huffman_encoding_new(dist, UCHAR_MAX + 1); + + if (filesize > 0) + huffman_encoding_decode(file, filesize, 0, h_enc); + + free(bitbuffer); + huffman_encoding_delete(h_enc); + + fclose(file); + return error; +} + +int main(int argc, char *argv[]) { + Options *options = option_new(argc, (char *const *) argv); + bool error = false; + + if (options == NULL) + return EXIT_FAILURE; + + if (options->decode == false) + error = encode(options->progname, options->filename); + else + error = decode(options->progname, options->filename); + + free(options); + return error ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/src/huffman-main.d b/src/huffman-main.d new file mode 100644 index 0000000000000000000000000000000000000000..935688a9bad870355bf1573bcb19bbd73f896e80 --- /dev/null +++ b/src/huffman-main.d @@ -0,0 +1,10 @@ +huffman-main.o: huffman-main.c distribution.h huffman.h codebook.h \ + bitbuffer.h + +distribution.h: + +huffman.h: + +codebook.h: + +bitbuffer.h: diff --git a/src/huffman-main.o b/src/huffman-main.o new file mode 100644 index 0000000000000000000000000000000000000000..17b5f03dd53201cde3dbdb3d261b5f8f368234d1 Binary files /dev/null and b/src/huffman-main.o differ diff --git a/src/huffman.c b/src/huffman.c new file mode 100644 index 0000000000000000000000000000000000000000..1f367fc409a40f6833f21f5a594fc3d0ddc90708 --- /dev/null +++ b/src/huffman.c @@ -0,0 +1,224 @@ +#include <stdlib.h> +#include <assert.h> +#include <string.h> +#include "huffman.h" +#include "priorityqueue.h" +#include "bitbuffer.h" + +struct Node { + struct Node *left; + struct Node *right; + unsigned char character; + unsigned long count; +}; + +struct Huffman_encoding { + struct Node *root; +}; + +static int compareOccurrences(const void *node1, const void *node2) { + const Node *a = (const Node *) node1; + const Node *b = (const Node *) node2; + + if (a->count < b->count) { + return -1; + } + if (a->count > b->count) { + return 1; + } + if (a->character < b->character) { + return -1; + } + if (a->character > b->character) { + return 1; + } + + return 0; +} + +Huffman_encoding *huffman_encoding_new(const unsigned long *dist, + unsigned long numsymbols) { + GtPriorityQueue *queue = gt_priority_queue_new(compareOccurrences, + numsymbols, 50); + for (unsigned long i = 0; i < numsymbols; i++) { + if (dist[i] > 0) { + Node *z = malloc(sizeof(*z)); + z->left = NULL; + z->right = NULL; + z->character = i; + z->count = dist[i]; + gt_priority_queue_add(queue, z); + } + } + while (gt_priority_queue_size(queue) >= 2) { + Node *x = gt_priority_queue_extract_max_prio(queue); + Node *y = gt_priority_queue_extract_max_prio(queue); + Node *z = malloc(sizeof(*z)); + z->left = y; + z->right = x; + z->character = x->character < y->character ? y->character + : x->character; + z->count = x->count + y->count; + gt_priority_queue_add(queue, z); + } + + Huffman_encoding *h_enc = malloc(sizeof(*h_enc)); + + if (gt_priority_queue_is_empty(queue)) { + h_enc->root = malloc(sizeof(*h_enc->root)); + h_enc->root->left = NULL; + h_enc->root->right = NULL; + h_enc->root->character = 0; + h_enc->root->count = 0; + } else { + h_enc->root = gt_priority_queue_extract_max_prio(queue); + } + + gt_priority_queue_delete(queue); + + return h_enc; +} + +void huffman_node_traversal(unsigned int length, + unsigned int code, + Node *currentNode, + Huffman_process_code process_code, + void *data) { + + if (currentNode->left != NULL) + huffman_node_traversal(length + 1, + (code << 1), + currentNode->left, + process_code, + data); + + if (currentNode->right != NULL) + huffman_node_traversal(length + 1, + ((code << 1) | 1), + currentNode->right, + process_code, + data); + + if (currentNode->left == NULL && currentNode->right == NULL) + process_code(length, + code, + currentNode->character, + currentNode->count, + data); +} + +void huffman_encoding_traversal(const Huffman_encoding *h_enc, + Huffman_process_code process_code, + void *data) { + unsigned int length = 0; + unsigned int code = 0; + + huffman_node_traversal(length, code, h_enc->root, process_code, data); +} + +unsigned int huffman_node_size(Node *currentNode, unsigned long length) { + + unsigned int size = 0; + + if (currentNode->left == NULL && currentNode->right == NULL) { + return (currentNode->count * length); + } + + if (currentNode->left != NULL) + size += huffman_node_size(currentNode->left, length + 1); + + if (currentNode->right != NULL) + size += huffman_node_size(currentNode->right, length + 1); + + return size; +} + +uint64_t huffman_encoding_size(const Huffman_encoding *h_enc) { + + uint64_t encoding_size; + unsigned long length = 0; + + encoding_size = huffman_node_size(h_enc->root, length); + + return encoding_size; +} + +unsigned int huffman_node_count(Node *currentNode) { + + unsigned int count = 0; + + if (currentNode->left == NULL && currentNode->right == NULL) { + return 1; + } + + if (currentNode->left != NULL) + count += huffman_node_count(currentNode->left); + + if (currentNode->right != NULL) + count += huffman_node_count(currentNode->right); + + return count; +} + +/* return the number of internal nodes in the huffman tree */ +unsigned long huffman_encoding_internalnodes(const Huffman_encoding *h_enc) { + return huffman_node_count(h_enc->root); +} + +/* perform decoding step: read encoding of file of length <totallength> + from <infp> and decode the symbols using the given huffman encoding + <h_enc>. If <symbolsinDNA> is 0, then the the decoded symbol is output. + For encoding FastQ-sequences, symbolsinDNA defines the number of + symbols in the DNA sequence. Output goes to standard out. */ +void huffman_encoding_decode(FILE *infp, + uint64_t totallength, + __attribute__((unused)) unsigned long symbolsinDNA, + const Huffman_encoding *h_enc) { + + GtBitbuffer *bitbuffer = gt_bitbuffer_new_get(infp); + Node *currentNode = h_enc->root; + unsigned long countnodes = huffman_encoding_internalnodes(h_enc); + + if (countnodes == 1) { + assert(h_enc->root->left == NULL); + for (unsigned long i = 0; i < totallength; i++) { + printf("%c", (char) currentNode->character); + } + } else { + unsigned long i = 0; + while (i < totallength) { + unsigned long currentBit = gt_bitbuffer_next_value_get(bitbuffer, + 1); + + if (currentBit == 1) + currentNode = currentNode->right; + else { + assert(currentBit == 0); + currentNode = currentNode->left; + } + + if (currentNode->left == NULL && currentNode->right == NULL) { + printf("%c", (char) currentNode->character); + currentNode = h_enc->root; + i++; + } + } + } + free(bitbuffer); +} + +static void huffman_node_delete(Node *node) { + if (node == NULL) + return; + if (node->left != NULL) + huffman_node_delete(node->left); + if (node->right != NULL) + huffman_node_delete(node->right); + free(node); +} + +void huffman_encoding_delete(Huffman_encoding *h_enc) { + assert(h_enc != NULL); + huffman_node_delete(h_enc->root); + free(h_enc); +} diff --git a/src/huffman.d b/src/huffman.d new file mode 100644 index 0000000000000000000000000000000000000000..4517467d3e4257c5bb9d51e47cdc7149f2a477a7 --- /dev/null +++ b/src/huffman.d @@ -0,0 +1,7 @@ +huffman.o: huffman.c huffman.h priorityqueue.h bitbuffer.h + +huffman.h: + +priorityqueue.h: + +bitbuffer.h: diff --git a/src/huffman.h b/src/huffman.h new file mode 100644 index 0000000000000000000000000000000000000000..a65156c9ffc5cdcfcb6a44f129f516c3f8b6a9fb --- /dev/null +++ b/src/huffman.h @@ -0,0 +1,74 @@ +/* + Copyright (c) 2011 Dirk Willrodt <willrodt@zbh.uni-hamburg.de> + Copyright (c) 2013 Stefan Kurtz <kurtz@zbh.uni-hamburg.de> + Copyright (c) 2013 Center for Bioinformatics, University of Hamburg +*/ + +#ifndef HUFFMAN_H +#define HUFFMAN_H +#include <inttypes.h> +#include <stdio.h> + +/* this type defines a struct which holds all necessary data to represent a + huffman encoding of a text, represented by the distribution of its + symbols. */ +typedef struct Huffman_encoding Huffman_encoding; + +/* gets an array of counts of length <numsymbols>. The i-th value in + <distribution> is the number of occurrences of the i-th ASCII-Symbol in + the given file. Return a representation of the Huffman-encoding.*/ +Huffman_encoding *huffman_encoding_new(const unsigned long *distribution, + unsigned long numsymbols); + +/* the destructor */ +void huffman_encoding_delete(Huffman_encoding *h_enc); + +/* return size of encoding in bits */ +uint64_t huffman_encoding_size(const Huffman_encoding *h_enc); + +/* this is the type for the function to process a code of given length for + the symbol with the given symbolnum and count-value. data refers + to data which may be required for the processing */ +typedef void (*Huffman_process_code)(unsigned int length, + unsigned int code, + unsigned int symbolnum, + unsigned long count, + void *data); + +/* perform a traversal of the Huffman-tree and apply <process_code> to + every node. <data> will be the last argument of any call to + <process_code>. */ +void huffman_encoding_traversal(const Huffman_encoding *h_enc, + Huffman_process_code process_code, + void *data); + +/* perform decoding step: read encoding of file of length <totallength> + from <infp> and decode the symbols using the given huffman encoding + <h_enc>. If <symbolsinDNA> is 0, then the the decoded symbol is output. + For encoding FastQ-sequences, symbolsinDNA defines the number of + symbols in the DNA sequence. Output goes to standard out. */ +void huffman_encoding_decode(FILE *infp, + uint64_t totallength, + unsigned long symbolsinDNA, + const Huffman_encoding *h_enc); + +void huffman_encoding_fast_decode(FILE *infp, + uint64_t totallength, + int widthbits, + const Huffman_encoding *h_enc); + +typedef struct +{ + unsigned int length, symbolnum; +} Codespec; + +Huffman_encoding *huffman_encoding_multi_insert(const Codespec *codespecs, + const unsigned int *codetab, + unsigned int numofsymbols); + +/* return the number of internal nodes in the huffman tree */ +unsigned long huffman_encoding_internalnodes(const Huffman_encoding *h_enc); + +typedef struct Node Node; + +#endif diff --git a/src/huffman.o b/src/huffman.o new file mode 100644 index 0000000000000000000000000000000000000000..ff683e3d44087131a80f6853907aa528b2ed98d7 Binary files /dev/null and b/src/huffman.o differ diff --git a/src/huffman.x b/src/huffman.x new file mode 100755 index 0000000000000000000000000000000000000000..430c4c679cdf204f684a0345beed12b3988e1309 Binary files /dev/null and b/src/huffman.x differ diff --git a/src/priorityqueue.c b/src/priorityqueue.c new file mode 100644 index 0000000000000000000000000000000000000000..d0eefa4521fb0cc10e845d827ee2020e5e73102e --- /dev/null +++ b/src/priorityqueue.c @@ -0,0 +1,166 @@ +/* + Copyright (c) 2013 Stefan Kurtz <kurtz@zbh.uni-hamburg.de> + Copyright (c) 2013 Center for Bioinformatics, University of Hamburg +*/ + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include <assert.h> +#include "priorityqueue.h" + +#define HEAP_PARENT(X) ((X) >> 1) +#define HEAP_LEFT(X) ((X) << 1) + +struct GtPriorityQueue +{ + GtCompare cmpfun; + unsigned long capacity, numofelements; + bool array_based; + void **elements; +}; + +GtPriorityQueue *gt_priority_queue_new(GtCompare cmpfun, + unsigned long maxnumofelements, + unsigned long min_heap_size) +{ + GtPriorityQueue *pq = malloc(sizeof *pq); + + assert(pq != NULL); + pq->elements = malloc(sizeof (*pq->elements) * (maxnumofelements + 1)); + assert(pq->elements != NULL); + pq->cmpfun = cmpfun; + pq->capacity = maxnumofelements; + pq->array_based = maxnumofelements < min_heap_size ? true : false; + pq->numofelements = 0; + pq->elements[0] = NULL; + return pq; +} + +void gt_priority_queue_reset(GtPriorityQueue *pq) +{ + assert(pq != NULL); + pq->numofelements = 0; + pq->elements[0] = NULL; +} + +bool gt_priority_queue_is_empty(const GtPriorityQueue *pq) +{ + assert(pq != NULL); + return pq->numofelements == 0 ? true : false; +} + +bool gt_priority_queue_is_full(const GtPriorityQueue *pq) +{ + assert(pq != NULL); + return pq->numofelements == pq->capacity ? true : false; +} + +unsigned long gt_priority_queue_size(const GtPriorityQueue *pq) +{ + return pq->numofelements; +} + +void gt_priority_queue_add(GtPriorityQueue *pq, void *value) +{ + assert(pq != NULL && !gt_priority_queue_is_full(pq)); + if (pq->array_based) + { + void **ptr; + + /* store elements in reverse order, i.e.\ with the element of maximum + priority at the last index */ + /* move elements to the right until an element larger or equal than + the key is found. */ + for (ptr = pq->elements + pq->numofelements; ptr > pq->elements; ptr--) + { + if (pq->cmpfun(*(ptr-1),value) < 0) + { + *ptr = *(ptr-1); + } else + { + break; + } + } + *ptr = value; + pq->numofelements++; + } else + { + unsigned long idx = ++pq->numofelements; + + while (true) + { + const unsigned long parent = HEAP_PARENT(idx); + + if (parent == 0 || pq->cmpfun(pq->elements[parent],value) <= 0) + { + break; + } + assert(idx > 0); + //printf("o: %lu <- %lu\n",idx,parent); + pq->elements[idx] = pq->elements[parent]; + idx = parent; + } + assert(idx > 0); + //printf("o: store elem at %lu\n",idx); + pq->elements[idx] = value; + } +} + +void *gt_priority_queue_extract_max_prio(GtPriorityQueue *pq) +{ + void *max_prio_element; + assert(pq != NULL && !gt_priority_queue_is_empty(pq)); + + if (pq->array_based) + { + assert(pq->numofelements > 0); + max_prio_element = pq->elements[--pq->numofelements]; + } else + { + unsigned long idx, child; + void *lastelement; + + max_prio_element = pq->elements[1]; + lastelement = pq->elements[pq->numofelements--]; + for (idx = 1UL; HEAP_LEFT(idx) <= pq->numofelements; idx = child) + { + child = HEAP_LEFT(idx); + assert(child > 0); + if (child != pq->numofelements && + pq->cmpfun(pq->elements[child + 1],pq->elements[child]) < 0) + { + child++; + } + if (pq->cmpfun(lastelement,pq->elements[child]) > 0) + { + //printf("maxprio, o: %lu <= %lu\n",idx,child); + pq->elements[idx] = pq->elements[child]; + } else + { + break; + } + } + assert(idx > 0); + //printf("maxprio, o: queue[%lu] <- lastelemen\n",idx); + pq->elements[idx] = lastelement; + } + return max_prio_element; +} + +const void *gt_priority_queue_find_max_prio(const GtPriorityQueue *pq) +{ + assert(pq != NULL && !gt_priority_queue_is_empty(pq)); + return pq->elements + (pq->array_based ? pq->numofelements-1 + : 1UL); +} + +void gt_priority_queue_delete(GtPriorityQueue *pq) +{ + if (pq != NULL) + { + free(pq->elements); + free(pq); + } +} diff --git a/src/priorityqueue.d b/src/priorityqueue.d new file mode 100644 index 0000000000000000000000000000000000000000..23c45c1d9e3b65d2b9feeb568cc8484f5b32f20c --- /dev/null +++ b/src/priorityqueue.d @@ -0,0 +1,3 @@ +priorityqueue.o: priorityqueue.c priorityqueue.h + +priorityqueue.h: diff --git a/src/priorityqueue.h b/src/priorityqueue.h new file mode 100644 index 0000000000000000000000000000000000000000..f63ae6685f80c9c2815e971b87ec7fe3e90c115d --- /dev/null +++ b/src/priorityqueue.h @@ -0,0 +1,64 @@ +/* + Copyright (c) 2013 Stefan Kurtz <kurtz@zbh.uni-hamburg.de> + Copyright (c) 2013 Center for Bioinformatics, University of Hamburg +*/ + +#ifndef PRIORITYQUEUE_H +#define PRIORITYQUEUE_H +#include <stdbool.h> + +/*lst{Priorityqueuenterface}*/ + +/* The type of the compare function. It is like the compare function + used in qsort */ +typedef int (*GtCompare)(const void *,const void *); + +/* The typename of the PriorityQueue */ +typedef struct GtPriorityQueue GtPriorityQueue; + +/* Create a new empty priority queue for at most maxnumofelements. Order + the elements according to the comparison function cmpfun, which should + return -1/1/0 if the first argument has less/more/the same priority + as the second argument. + There are two implementations of priority queues available: + (1) One based on arrays with O(n) time for the add operation and O(1) time + for the find_max_prio and extract_max_prio- operations, where n + is the number of elements in the queue. This should be used if + maxnumofelements is small. + (2) One based on a heap with O(\log n) running time for the add, + and the extract_max_prio-operations and O(1) time for the + find_max_prio-operations. + if maxnumofelements < min_heap_size, then the array based implementation + is used, otherwise the heap based implemention +*/ +GtPriorityQueue *gt_priority_queue_new(GtCompare cmpfun, + unsigned long maxnumofelements, + unsigned long min_heap_size); + +/* reset the Queue, so that it becomes empty */ +void gt_priority_queue_reset(GtPriorityQueue *pq); + +/* Add element referred to by value to the queue. The queue only stores + the reference and the user is responsible for allocating space for + that element. */ +void gt_priority_queue_add(GtPriorityQueue *pq, void *value); + +/* return the number of elements stored in the queue */ +unsigned long gt_priority_queue_size(const GtPriorityQueue *pq); + +/* return reference to element in queue which has maximum priority. Delete + the element from the queue. The space referred to can be reused. */ +void *gt_priority_queue_extract_max_prio(GtPriorityQueue *pq); +/* return reference to element in queue which has maximum priority. Keep + the reference in the queue. The space referred to cannot be reused. */ +const void *gt_priority_queue_find_max_prio(const GtPriorityQueue *pq); +/* return true iff Queue is empty, i.e. contains no elements */ +bool gt_priority_queue_is_empty(const GtPriorityQueue *pq); +/* return true iff Queue is full, i.e. has achieved its maximumum capacity + as specified by the parameter maxnumofelements of the _new function. */ +bool gt_priority_queue_is_full(const GtPriorityQueue *pq); +/* Delete the space for the priority queue */ +void gt_priority_queue_delete(GtPriorityQueue *pq); +/*lstend*/ + +#endif diff --git a/src/priorityqueue.o b/src/priorityqueue.o new file mode 100644 index 0000000000000000000000000000000000000000..d125bed36f2812fb0973f9f869bfd2d8102a6871 Binary files /dev/null and b/src/priorityqueue.o differ diff --git a/src/run_calc_compression.sh b/src/run_calc_compression.sh new file mode 100755 index 0000000000000000000000000000000000000000..88663dbb496a7993df88e808099fb905cf42e983 --- /dev/null +++ b/src/run_calc_compression.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +set -e -x + +# uncomment the following line if you want to debug your program +# and valgrind as well as valgrind.sh is available +#VALGRIND=valgrind.sh +for filename in `ls *.[cox]` +do + ${VALGRIND} ./calc_compression.x ${filename} +done +filelist="../data/README ../data/a29.txt ../data/emptyfile ../data/shaks.data" +for filename in ${filelist} `ls ../data/Canterburycorpus/*` +do + base=`basename ${filename}` + ${VALGRIND} ./calc_compression.x ${filename} | diff - ../data/results_ccp/${base}.ccp +done diff --git a/src/run_huffman.sh b/src/run_huffman.sh new file mode 100755 index 0000000000000000000000000000000000000000..be514c8267844b29e3b8edb0b3d6739cab217d0c --- /dev/null +++ b/src/run_huffman.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +set -e -x + +# uncomment the following line if you want to debug your program +# and valgrind as well as valgrind.sh is available +VALGRIND=valgrind.sh +filelist="../data/README ../data/a29.txt ../data/emptyfile ../data/shaks.data" +for filename in ${filelist} `ls ../data/Canterburycorpus/*` +do + ${VALGRIND} ./huffman.x ${filename} + ${VALGRIND} ./huffman.x -d ${filename}.hf | diff - ${filename} + rm -f ${filename}.hf +done