Skip to content
Snippets Groups Projects
Commit 11bdfc58 authored by Jocelyne's avatar Jocelyne
Browse files

pfn2 c

parent d37ca5c2
No related branches found
No related tags found
No related merge requests found
Showing
with 813 additions and 0 deletions
No preview for this file type
.PHONY:test
CC?=clang
CFLAGS=-g -m64 -Wall -Werror -Wunused-parameter -Wunused-variable -O3
FILE_BASE=dna2aa
all:${FILE_BASE}.x
%.x:%.c
${CC} ${CFLAGS} -o $@ $<
test:${FILE_BASE}.x
@./compare_dna2aa.sh 4000000
@echo "Congratulations. $@ passed"
.PHONY:clean
clean:
@${RM} ${FILE_BASE}.[ox]
@${RM} -r __pycache__ ${FILE_BASE}.x.dSYM
import sys
genetic_code = {
'TCA' : 'S', # Serine
'TCC' : 'S', # Serine
'TCG' : 'S', # Serine
'TCT' : 'S', # Serine
'TTC' : 'F', # Phenylalanine
'TTT' : 'F', # Phenylalanine
'TTA' : 'L', # Leucine
'TTG' : 'L', # Leucine
'TAC' : 'Y', # Tyrosine
'TAT' : 'Y', # Tyrosine
'TAA' : '*', # Stop
'TAG' : '*', # Stop
'TGC' : 'C', # Cysteine
'TGT' : 'C', # Cysteine
'TGA' : '*', # Stop
'TGG' : 'W', # Tryptophan
'CTA' : 'L', # Leucine
'CTC' : 'L', # Leucine
'CTG' : 'L', # Leucine
'CTT' : 'L', # Leucine
'CCA' : 'P', # Proline
'CCC' : 'P', # Proline
'CCG' : 'P', # Proline
'CCT' : 'P', # Proline
'CAC' : 'H', # Histidine
'CAT' : 'H', # Histidine
'CAA' : 'Q', # Glutamine
'CAG' : 'Q', # Glutamine
'CGA' : 'R', # Arginine
'CGC' : 'R', # Arginine
'CGG' : 'R', # Arginine
'CGT' : 'R', # Arginine
'ATA' : 'I', # Isoleucine
'ATC' : 'I', # Isoleucine
'ATT' : 'I', # Isoleucine
'ATG' : 'M', # Methionine
'ACA' : 'T', # Threonine
'ACC' : 'T', # Threonine
'ACG' : 'T', # Threonine
'ACT' : 'T', # Threonine
'AAC' : 'N', # Asparagine
'AAT' : 'N', # Asparagine
'AAA' : 'K', # Lysine
'AAG' : 'K', # Lysine
'AGC' : 'S', # Serine
'AGT' : 'S', # Serine
'AGA' : 'R', # Arginine
'AGG' : 'R', # Arginine
'GTA' : 'V', # Valine
'GTC' : 'V', # Valine
'GTG' : 'V', # Valine
'GTT' : 'V', # Valine
'GCA' : 'A', # Alanine
'GCC' : 'A', # Alanine
'GCG' : 'A', # Alanine
'GCT' : 'A', # Alanine
'GAC' : 'D', # Aspartic Acid
'GAT' : 'D', # Aspartic Acid
'GAA' : 'E', # Glutamic Acid
'GAG' : 'E', # Glutamic Acid
'GGA' : 'G', # Glycine
'GGC' : 'G', # Glycine
'GGG' : 'G', # Glycine
'GGT' : 'G', # Glycine
}
def codon2aa(codon):
codon = codon.upper()
if codon in genetic_code:
return genetic_code[codon]
else:
sys.stderr.write('Bad codon "{}"\n'.format(codon))
exit(1)
#!/bin/sh
if test $# -ne 1
then
echo "Usage: $0 <sequence length>"
exit 1
fi
DNA_SEQUENCE_FILENAME=`mktemp /tmp/dna_seq.fnaXXXXXX` || exit 1
PROTEIN_SEQUENCE_FILENAME=`mktemp /tmp/protein_seq.fnaXXXXXX` || exit 1
length=$1
./random_sequence.py --dna --length ${length} > ${DNA_SEQUENCE_FILENAME}
time ./dna2aa.x ${DNA_SEQUENCE_FILENAME} > ${PROTEIN_SEQUENCE_FILENAME}
time ./dna2aa.py ${DNA_SEQUENCE_FILENAME} | diff - ${PROTEIN_SEQUENCE_FILENAME}
rm -f ${DNA_SEQUENCE_FILENAME} ${PROTEIN_SEQUENCE_FILENAME}
#include <stdbool.h>
#include <assert.h>
#include <stdlib.h>
#include <limits.h>
#include <stdio.h>
#include <inttypes.h>
static char *read_fasta_format(unsigned long *length,FILE *fp)
{
#define MAXLINE 100
char *sequence = NULL, line[MAXLINE+1];
unsigned long allocated = 0, nextfree = 0;
while (fgets(line,MAXLINE,fp) != NULL)
{
if (*line != '>')
{
char *ptr;
for (ptr = line; *ptr != '\n'; ptr++)
{
if (nextfree >= allocated)
{
allocated = allocated * 1.2 + 4;
sequence = realloc(sequence,allocated * sizeof *sequence);
assert(sequence != NULL);
}
assert(nextfree < allocated);
sequence[nextfree++] = *ptr;
}
}
}
*length = nextfree;
return sequence;
}
static void print_fasta_format(const char *sequence,unsigned long length)
{
unsigned long idx, lineoffset = 0;
for (idx = 0; idx < length; idx++)
{
if (lineoffset >= 70)
{
printf("\n");
lineoffset = 0;
}
putchar(sequence[idx]);
lineoffset++;
}
printf("\n");
}
typedef struct
{
const char *name; /* the name of the translation */
unsigned int identity; /* the identity number */
const char *aminos, /* array of amino acids in order of codons,
star stands for stop codon */
*startcodons; /* the start codons, represented by Ms in string
filled with - */
uint8_t map[UCHAR_MAX+1]; /* map of bases to numbers in range 0..3,
sorted in alphabet order T < C < A < G */
} TranslationScheme;
/*lst{initcharmap}*/
static void init_char_map(uint8_t *map)
{
int idx;
for (idx = 0; idx <= UINT8_MAX; idx++) {
map[idx] = UINT8_MAX;
}
map['T'] = 0;
map['t'] = 0;
map['U'] = 0;
map['u'] = 0;
map['C'] = 1;
map['c'] = 1;
map['A'] = 2;
map['a'] = 2;
map['G'] = 3;
map['g'] = 3;
}
/*lstend*/
/*lst{codontocode}*/
unsigned int codon2code(const uint8_t *map,const char *codon)
{
uint8_t cc_0 = map[(int) codon[0]],
cc_1 = map[(int) codon[1]],
cc_2 = map[(int) codon[2]];
assert(cc_0 < 4 && cc_1 < 4 && cc_2 < 4);
return 16 * cc_0 + 4 * cc_1 + cc_2;
}
/*lstend*/
/* map T0->A2, C1->G3, A2->T0, G3->C1 */
/*lst{codontocodecomplement}*/
uint8_t wcc_map(uint8_t cc) { return (cc + 2) % 4; }
unsigned int codon2code_rc(const uint8_t *map,
const char *codon) {
uint8_t cc_0 = map[(int) codon[2]], /* reverse order */
cc_1 = map[(int) codon[1]],
cc_2 = map[(int) codon[0]];
cc_0 = wcc_map(cc_0);
cc_1 = wcc_map(cc_1);
cc_2 = wcc_map(cc_2);
return 16 * cc_0 + 4 * cc_1 + cc_2;
}
/*lstend*/
unsigned int codon2code_stepwise(const uint8_t *map,bool forward,
const char *codon)
{
int idx;
unsigned int code = 0;
for (idx = 0; idx < 3; idx++)
{
uint8_t m_cc, cc = codon[forward ? idx : 2 - idx];
m_cc = map[(int) cc];
if (m_cc == (uint8_t) UINT8_MAX)
{
fprintf(stderr,"Illegal character at pos %d in codon\n",idx);
exit(EXIT_FAILURE);
}
if (!forward)
{
m_cc = wcc_map(m_cc);
}
assert(m_cc < 4);
code = (code * 4) + m_cc;
}
if (code >= 64)
{
fprintf(stderr,"code=%u,codon=%3.3s\n",code,codon);
exit(EXIT_FAILURE);
}
return code;
}
const TranslationScheme *translationscheme_new(void)
{
static TranslationScheme schemetable[] = {
{"Standard",
(unsigned int) 1,
"FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
"---M---------------M---------------M----------------------------"}
};
init_char_map(schemetable[0].map);
return schemetable;
}
char *dna2peptide(unsigned long *peptide_length,const char *dnaseq,
unsigned long len,bool forward)
{
if (len < 3)
{
*peptide_length = 0;
return NULL;
} else
{
const char *codon;
const TranslationScheme *scheme = translationscheme_new();
char *peptide, *peptide_ptr;
peptide_ptr = peptide = malloc(len/3 * sizeof *peptide);
assert(peptide != NULL);
if (forward)
{
for (codon = dnaseq; codon < dnaseq + len - 2; codon += 3)
{
unsigned int code = codon2code(scheme->map,codon);
*peptide_ptr++ = scheme->aminos[code];
}
} else
{
for (codon = dnaseq + len - 3; codon >= dnaseq; codon -= 3)
{
unsigned int code = codon2code_rc(scheme->map,codon);
*peptide_ptr++ = scheme->aminos[code];
}
}
*peptide_length = (unsigned long) (peptide_ptr - peptide);
return peptide;
}
}
int main(int argc,char *argv[])
{
FILE *fp;
char *dnaseq;
int idx;
unsigned long length;
if (argc != 2)
{
fprintf(stderr,"Usage: %s <inputfile>\n",argv[0]);
return EXIT_FAILURE;
}
const char *inputfile = argv[1];
fp = fopen(inputfile,"rb");
if (fp == NULL)
{
fprintf(stderr,"%s: cannot open file %s\n",argv[0],inputfile);
return EXIT_FAILURE;
}
dnaseq = read_fasta_format(&length,fp);
fclose(fp);
fprintf(stderr,"read sequence of length %lu\n",length);
for (idx = 0; idx < 2; idx++)
{
char *peptide;
unsigned long peptide_length;
bool forward = (idx == 0) ? true : false;
peptide = dna2peptide(&peptide_length,dnaseq,length,forward);
fprintf(stderr,"%s: translate DNA sequence of length %lu from file %s "
"to protein sequence of length %lu, forward=%s\n",
argv[0],
length,
inputfile,
peptide_length,
forward ? "true" : "false");
printf(">%stranslation of sequence from %s\n",
forward ? "" : "reverse complement ",inputfile);
print_fasta_format(peptide,peptide_length);
free(peptide);
}
free(dnaseq);
return EXIT_SUCCESS;
}
#!/usr/bin/env python
import sys, re, argparse
from codon2aa import codon2aa
from reverse_complement import reverse_complement
from print_sequence import print_sequence
from fastaIterator import fasta_next
def cds2protein(seq):
aa_list = list()
for codon in re.findall(r'[ACGT]{3}',seq,flags=re.I):
aa = codon2aa(codon)
aa_list.append(aa)
return ''.join(aa_list)
def parse_arguments():
p = argparse.ArgumentParser(description=('transform coding sequence into '
'protein'))
p.add_argument('-w','--width',type=int,default=70,
help='specify width of line in fasta formatted output')
p.add_argument('inputfile',type=str,
help='specify input file with coding sequence')
return p.parse_args()
def msg(length,inputfile,plength,forward):
sys.stderr.write(('{}: translate DNA sequence of length {} from file {} '
'to protein sequence of length {}, forward={}\n')
.format(sys.argv[0],length,inputfile,plength,forward))
args = parse_arguments()
try:
stream = open(args.inputfile)
except IOError as err:
sys.stderr.write('{}: {}\n'.format(sys.argv[0],err))
exit(1)
for _, sequence in fasta_next(args.inputfile):
protein = cds2protein(sequence)
msg(len(sequence),args.inputfile,len(protein),'true')
print('>translation of sequence from {}'.format(args.inputfile))
print_sequence(protein,args.width)
sequence = reverse_complement(sequence)
protein = cds2protein(sequence)
msg(len(sequence),args.inputfile,len(protein),'false')
print('>reverse complement translation of sequence from {}'
.format(args.inputfile))
print_sequence(protein,args.width)
File added
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleDevelopmentRegion</key>
<string>English</string>
<key>CFBundleIdentifier</key>
<string>com.apple.xcode.dsym.dna2aa.x</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundlePackageType</key>
<string>dSYM</string>
<key>CFBundleSignature</key>
<string>????</string>
<key>CFBundleShortVersionString</key>
<string>1.0</string>
<key>CFBundleVersion</key>
<string>1</string>
</dict>
</plist>
#!/usr/bin/env python3
import sys, re, argparse
def print_sequence(seq,linelength = 70):
for startpos in range(0,len(seq),linelength):
print(seq[startpos:startpos+linelength])
def seq_list2sequence(seq_list):
sequence = ' '.join(seq_list)
return re.sub('\s','',sequence)
def stream_get(filename):
if filename == '-':
return sys.stdin
try:
stream = open(filename,'r')
except IOError as err:
sys.stderr.write('{}: {}\n'
.format(sys.argv[0],err))
exit(1)
return stream
def fasta_next(filename):
stream = stream_get(filename)
seq_list = list()
header = None
for line in stream:
if not re.search(r'^(>|\s*$|\s*#)',line):
seq_list.append(line.rstrip())
else:
mo = re.search(r'^>(.*)',line)
if mo:
if header:
yield header, seq_list2sequence(seq_list)
seq_list.clear()
header = mo.group(1)
if header:
yield header, seq_list2sequence(seq_list)
if filename != '-':
stream.close()
def fastq_next(filename):
stream = stream_get(filename)
seq_list = list()
state = 0
header = None
for line in stream:
line = line.rstrip()
if state == 0:
assert len(line) > 0 and line[0] == '@'
header = line[1:]
state += 1
elif state == 1:
yield header, line
state += 1
elif state == 2:
state += 1
else:
state = 0
if filename != '-':
stream.close()
def parse_arguments():
p = argparse.ArgumentParser(description='parse fasta or fastq file')
p.add_argument('-w','--width',type=int,default=64,metavar='<int>',
help='specify width of lines in sequence output')
p.add_argument('inputfiles',nargs='+',
help='specify input files, - means stdin')
return p.parse_args()
if __name__ == '__main__':
args = parse_arguments()
for filename in args.inputfiles:
if re.search(r'\.fq$',filename) or re.search(r'\.fastq$',filename):
reader = fastq_next
else:
reader = fasta_next
for header, sequence in reader(filename):
print('>{}'.format(header))
print_sequence(sequence, args.width)
import sys
#lst{printSequence}
def print_sequence(seq,linelength,stream = sys.stdout):
for startpos in range(0,len(seq),linelength):
stream.write('{}\n'.format(seq[startpos:startpos+linelength]))
#lstend#
#!/usr/bin/env python3
import argparse
from random import randint
def parse_arguments():
p = argparse.ArgumentParser(description='generate random sequence')
seqtypeclass = p.add_mutually_exclusive_group(required=True)
seqtypeclass.add_argument('--dna',action='store_true',default=False,
help='generate DNA sequence')
seqtypeclass.add_argument('--protein',action='store_true',default=False,
help='generate protein sequence')
seqtypeclass.add_argument('--alphabet',type=str,default=None,
help='generate sequence over given alphabet')
p.add_argument('-w','--width',type=int,default=70,
help='specify width of line in fasta formatted output')
p.add_argument('-l','--length',type=int,default=1000,
help='specify length of sequence to generate')
return p.parse_args()
args = parse_arguments()
alphabet = None
if args.dna:
alphabet = 'ACGT'
elif args.protein:
alphabet = 'ACDEFGHIKLMNPQRSTVWY'
else:
alphabet = args.alphabet
print('>random sequence of length {} over alphabet {}'
.format(args.length,alphabet))
line = list()
alpha_size = len(alphabet)
for idx in range(0,args.length):
cc = alphabet[randint(0,alpha_size-1)]
line.append(cc)
if len(line) == args.width:
print(''.join(line))
line.clear()
if len(line) > 0:
print(''.join(line))
#!/usr/bin/env python3
#title Calculating the reverse complement of a DNA sequence
import string
#lst{ReverseFromReversed}
def reverse(seq):
return ''.join(reversed(seq))
#lstend#
#lst{ReverseComplement}
def reverse_complement(seq):
revcom = reverse(seq)
transtab = str.maketrans('ACGTacgt','TGCAtgca')
return revcom.translate(transtab)
#lstend#
if __name__ == '__main__':
s = 'agctatcagcagcat'
t = reverse_complement(s)
assert s == reverse_complement(t), \
's = {} != {} = RC({})\n'.format(s,reverse_complement(t),t)
# Zuerst tragen Sie bitte f"ur den Platzhalter
# h unten die Anzahl der Stunden, die sie f"ur die
# Aufgabe aufgewendet haben, ein. Das k"onnen auch Flie"spunkt-Werte sein,
# also z.B. 1.5, wenn Sie 1 Stunde und 30 Minuten ben"otigt haben.
Bearbeitungszeit: 0.5 Stunden
# Als n"achstes tragen Sie bitte ein, wie schwierig
# Sie die Aufgabe fanden. Die m"oglichen Werte sind vorgegeben. Bitte
# die Werte streichen, die nicht zutreffen:
Schwierigkeitsgrad: leicht
# Schliesslich tragen Sie bitte ein, wie verst"andlich die
# Aufgabenstellung formuliert war. Die m"oglichen Werte sind vorgegeben.
# Bitte die Werte streichen, die nicht zutreffen.
Aufgabenstellung: vollkommen klar
Frau Jakovljevic hat erklärt wie die DNA-Strings und Codons eingeteilt werden
und wie den Integer Codes umgewandelt werden kann. Danach hat sie erklärt wie
wir das (und auch das "reverse" Code) in einem C Programm implementieren
k"onnen. Wir haben auch besprochen wie man mapping in C machen kann, im Vergleich
zu Python.
# Falls eine der beiden letzten Kategorien zutrifft, bitte
# kurz angeben, worin die Unklarheiten bestehen.
1
3
4
7
9
10
13
15
19
21
22
27
28
31
39
40
43
45
46
55
57
58
63
64
67
79
81
82
85
87
91
93
94
1
3
4
7
9
10
13
15
19
21
22
27
28
31
39
40
43
45
46
55
57
58
63
64
67
79
81
82
85
87
91
93
94
111
115
117
118
121
127
129
130
135
136
139
159
163
165
166
171
172
175
183
187
189
190
193
202
223
231
235
237
238
243
244
247
255
256
259
261
262
271
273
274
279
280
283
319
327
331
333
334
343
345
346
351
352
355
364
367
375
379
381
382
387
388
391
405
406
409
418
447
463
471
475
477
478
487
489
490
495
496
499
511
513
514
517
519
523
525
526
543
547
549
550
559
561
562
567
568
571
580
607
639
655
663
667
669
670
687
691
693
694
703
705
706
711
712
715
729
730
733
735
742
751
759
763
765
766
769
775
777
778
783
784
787
811
813
814
819
820
823
837
838
841
850
895
927
943
951
955
957
958
975
979
981
982
991
993
994
999
1000
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment