****** Week1 - Tuesday ****** @@ Assignment 1 @@ ======= week1_tuesday_1.py ======= #!/usr/bin/env python import sys print float(sys.argv[1]) + float(sys.argv[2]) ================================== @@ Assignment 2 @@ ======= week1_tuesday_2.py ======= #!/usr/bin/env python import sys if len(sys.argv) != 3: sys.exit("Please provide two numeric arguments") else: print float(sys.argv[1]) + float(sys.argv[2]) ================================== an alternate version which uses exceptions is ======= week1_tuesday_2b.py ======= #!/usr/bin/env python import sys if len(sys.argv) != 3: sys.exit("Please provide two arguments") try: print float(sys.argv[1]) + float(sys.argv[2]) except ValueError: sys.exit("Please provide two numeric arguments") ================================== @@ Assignment 3 @@ ======= week1_tuesday_3.py ======= #!/usr/bin/env python import datetime print "It is now", datetime.datetime.now() ================================== @@ Assignment 4 @@ ======= reversec.py ======= #!/usr/bin/env python import sys import string seq = sys.argv[1] print seq.translate(string.maketrans("ATCG", "TAGC"))[::-1] ================================== ****** Week1 - Wednesday ****** @@ Assignment 1 @@ ======= week1_wednesday_1.py ======= seq = raw_input("Enter a sequence: ") print "It is", len(seq), "bases long" ==================================== I don't need to use the variable name 'seq'. I could use any name I wanted, so the following also works. ======= week1_wednesday_1b.py ======= qqqqq = raw_input("Enter a sequence: ") print "It is", len(qqqqq), "bases long" ===================================== @@ Assignment 2 @@ ======= week1_wednesday_2.py ======= seq = raw_input("Enter a sequence: ") print "It is", len(seq), "bases long" print "adenine:", seq.count("A") print "thymine:", seq.count("T") print "cytosine:", seq.count("C") print "guanine:", seq.count("G") ==================================== @@ Assignment 3 @@ One solution is to convert the input sequence into uppercase. ======= week1_wednesday_3a.py ======= seq = raw_input("Enter a sequence: ") seq = seq.upper() print "It is", len(seq), "bases long" print "adenine:", seq.count("A") print "thymine:", seq.count("T") print "cytosine:", seq.count("C") print "guanine:", seq.count("G") ===================================== Another is to count both uppercase and lowercase letters. ======= week1_wednesday_3b.py ======= seq = raw_input("Enter a sequence: ") print "It is", len(seq), "bases long" print "adenine:", (seq.count("A") + seq.count("a")) print "thymine:", (seq.count("T") + seq.count("t")) print "cytosine:", (seq.count("C") + seq.count("c")) print "guanine:", (seq.count("G") + seq.count("g")) ===================================== @@ Assignment 4 @@ The known characters are A, T, C, and G. The total number of bases is the length of the sequence. So the number of unknown bases is the total length less the number of known bases. I use the parenthesis in the last print statement so I can make things line up pretty. ======= week1_wednesday_4.py ======= seq = raw_input("Enter a sequence: ") seq = seq.upper() print "It is", len(seq), "bases long" print "adenine:", seq.count("A") print "thymine:", seq.count("T") print "cytosine:", seq.count("C") print "guanine:", seq.count("G") print "unknown:", len(seq) - (seq.count("A") + seq.count("T") + seq.count("C") + seq.count("G") ) ===================================== ****** Week1 - Thursday ****** @@ Assignment 1 @@ ======= week1_thursday_1.py ======= seq = raw_input("Enter a sequence: ") for n in range(10): print n, seq =================================== @@ Assignment 2 @@ Here is one solution. I use the variable "base" for each letter in the sequence, but I could have used any variable name. ======= week1_thursday_2a.py ======= seq = raw_input("Enter a sequence: ") n = 0 for base in seq: print "base", n, "is", base n = n + 1 =================================== Here is the version which starts with 1 instead of 0 ======= week1_thursday_2a_from_1.py ======= seq = raw_input("Enter a sequence: ") n = 1 # here is the change for base in seq: print "base", n, "is", base n = n + 1 =========================================== Here is another way to solve this exercise. First, counting from 0. ======= week1_thursday_2b.py ======= seq = raw_input("Enter a sequence: ") for n in range(len(seq)): print "base", n, "is", seq[n] ==================================== And starting from 1. ======= week1_thursday_2b_from_1.py ======= seq = raw_input("Enter a sequence: ") for n in range(len(seq)): print "base", n+1, "is", seq[n] =========================================== @@ Assignment 3 @@ ======= week1_thursday_3.py ======= restriction_sites = [ "GAATTC", # EcoRI "GGATCC", # BamHI "AAGCTT", # HindIII ] for site in restriction_sites: print site, "is a restriction site" =================================== @@ Assignment 4 @@ ======= week1_thursday_4.py ======= restriction_sites = [ "GAATTC", # EcoRI "GGATCC", # BamHI "AAGCTT", # HindIII ] seq = raw_input("Enter a sequence: ") for site in restriction_sites: print site, "is in the sequence", site in seq =================================== ****** Week1 - Friday ****** @@ Assignment 1 @@ ======= week1_friday_1a.py ======= seq = raw_input("Enter a sequence: ") if "A" in seq: print "A count:", seq.count("A") if "T" in seq: print "T count:", seq.count("T") if "C" in seq: print "C count:", seq.count("C") if "G" in seq: print "G count:", seq.count("G") ================================= Here's another solution. In this one I save the count to a variable and test if that value is non-zero. This is slightly more efficient because Python only checks the string once. The "!=" is Python's way of saying "not equal to". Another way you could do the test is "if n:" because the value 0 is False and a non-zero value is True. ======= week1_friday_1b.py ======= seq = raw_input("Enter a sequence: ") n = seq.count("A") if n != 0: print "A count:", n n = seq.count("T") if n != 0: print "T count:", n n = seq.count("C") if n != 0: print "C count:", n n = seq.count("G") if n != 0: print "G count:", n ================================= Here is still another alternative, which use the for loop from a few days from now. ======= week1_friday_1c.py ======= seq = raw_input("Enter a sequence: ") for base in "ATCG": n = seq.count(base) if n != 0: print base, "count:", n ================================= @@ Assignment 2 @@ To solve this exercise, add the "else:" statement then fill in the False case. ======= week1_friday_2.py ======= seq = raw_input("Enter a sequence: ") if "A" in seq: print "A count:", seq.count("A") else: print "A not found" if "T" in seq: print "T count:", seq.count("T") else: print "T not found" if "C" in seq: print "C count:", seq.count("C") else: print "C not found" if "G" in seq: print "G count:", seq.count("G") else: print "G not found" =================================== @@ Assignment 3 @@ This solution is similar to week1_thursday_2a.py except this time it works on lines in a file instead of characters in a string. ======= week1_friday_3.py ========= n = 1 for line in open("/usr/coursehome/dalke/10_sequences.seq"): seq = line.rstrip() print n, seq n = n + 1 =================================== @@ Assignment 4 @@ ======= week1_friday_4a.py ======== for line in open("/usr/coursehome/dalke/10_sequences.seq"): seq = line.rstrip() if "CTATA" in seq: print seq =================================== The find method returns -1 when the string isn't found. ======= week1_friday_4b.py ======= for line in open("/usr/coursehome/dalke/10_sequences.seq"): seq = line.rstrip() i = seq.find("CTATA") if i != -1: print "CTATA at position", i, "in", seq =================================== @@ Assignment 5 @@ ======= week1_friday_5.py ======= num_sequences = 0 num_CTATA = 0 num_over_1000 = 0 num_high_GC = 0 num_over_2000_high_GC = 0 for line in open("/usr/coursehome/dalke/sequences.seq"): seq = line.rstrip() num_sequences = num_sequences + 1 if "CTATA" in seq: num_CTATA = num_CTATA + 1 if len(seq) > 1000: num_over_1000 = num_over_1000 + 1 GC_content = (seq.count("G") + seq.count("C")) / float(len(seq)) if GC_content > 0.5: num_high_GC = num_high_GC + 1 if len(seq) > 2000 and GC_content > 0.6: num_over_2000_high_GC = num_over_2000_high_GC + 1 # Once all the lines are processed, print the results print num_sequences, "sequences" print num_CTATA, "with the pattern CTATA" print num_over_1000, "have over 1000 bases" print num_high_GC, "have over 50% GC" print num_over_2000_high_GC, "have over 2000 bases and over 50% GC" =================================== ****** Week1 - Saturday ****** @@ Assignment 6 @@ The data file 'sequences.seq' contains sequences with the bases A, T, C, and G. These are represented with the standard one letter code. Some of the bases are ambiguous, eg, the sequencer couldn't figure out that a base was an A or a T. These are represented with the standard one-letter ambiguity code. This affects the GC calculations because the GC content program written above ignores codes which imply a chance of C or G. For example, "S" means "G or C" so a GC calculation program should count include that base in the GC count. It might also say that since M is "A or C" then it should count 50% for a non-GC and 50% for a GC. The ambiguity may not be exactly 50% so another way to resolve this problem is to find the lowest and highest possible GC values, by first assuming that ambiguity codes are never G or C (except for M) and then assume that they are always a G or C. To solve this exercise, use the answer to week1_wednesday_4.py to find if the sequence contains any unknown bases. ======= week1_saturday_6.py ======= n = 0 for line in open("/usr/coursehome/dalke/sequences.seq"): seq = line.rstrip() known_count = (seq.count("A") + seq.count("T") + seq.count("C") + seq.count("G")) if known_count < len(seq): n = n + 1 print n, "sequences have ambiguous bases" =================================== @@ Assignment 7 @@ ======= week1_saturday_7.py ======= known_letters = "ATCG" unknown_letters = [] # Read every line in the file for line in open("/usr/coursehome/dalke/sequences.seq"): seq = line.rstrip() # Look at every base in the sequence for base in seq: # If the base isn't one of A, T, C, or G if base not in known_letters: # And the base is not in the list of unknown letters if base not in unknown_letters: # Then add it to the end of the list of unknown letters unknown_letters.append(base) print "The ambiguous codes used are", unknown_letters =================================== @@ Assignment 8 @@ ======= week1_saturday_8.py ======= for line in open("/usr/coursehome/dalke/sequences.seq"): seq = line.rstrip() MW = (seq.count("A") * 347.0 + seq.count("C") * 323.0 + seq.count("B") * 336.0 + seq.count("D") * 344.0 + seq.count("G") * 363.0 + seq.count("H") * 330.666666667 + seq.count("K") * 342.5 + seq.count("M") * 335.0 + seq.count("N") * 338.75 + seq.count("S") * 343.0 + seq.count("R") * 355.0 + seq.count("T") * 322.0 + seq.count("W") * 334.5 + seq.count("V") * 344.333333333 + seq.count("Y") * 322.5 + seq.count("X") * 338.75 + 18.0) # add in the terminal H- and -OH contributions if 224245 <= MW and MW <= 226940: print MW, seq =================================== @@ Assignment 10 @@ ======= week1_saturday_10.py ====== print "Search a file for sequences with MW within a given range" filename = raw_input("Filename to search: ") min_MW = float(raw_input("Lower bound for molecular weight: ")) max_MW = float(raw_input("Upper bound for molecular weight: ")) for line in open("/usr/coursehome/dalke/sequences.seq"): seq = line.rstrip() MW = (seq.count("A") * 347.0 + seq.count("C") * 323.0 + seq.count("B") * 336.0 + seq.count("D") * 344.0 + seq.count("G") * 363.0 + seq.count("H") * 330.666666667 + seq.count("K") * 342.5 + seq.count("M") * 335.0 + seq.count("N") * 338.75 + seq.count("S") * 343.0 + seq.count("R") * 355.0 + seq.count("T") * 322.0 + seq.count("W") * 334.5 + seq.count("V") * 344.333333333 + seq.count("Y") * 322.5 + seq.count("X") * 338.75) if min_MW <= MW and MW <= max_MW: print MW, seq =================================== ****** Week2 - Monday ****** @@ Assignment 1 @@ ======= week2_monday_1.py ======= seq = raw_input("Enter DNA: ") counts = {} for base in seq: if base not in counts: counts[base] = 1 else: counts[base] = counts[base] + 1 # Print the items in the dictionary for base in counts: print base, "=", counts[base] =================================== Here's a way someone with more Python experience might write it. ======= week2_monday_1b.py ======= seq = raw_input("Enter DNA: ") counts = {} for base in seq: counts[base] = counts.get(base, 0) + 1 # Print the items in the dictionary for base in counts: print base, "=", counts[base] =================================== @@ Assignment 2 @@ This time we get the sequences from lines in the file, instead of getting it using raw_input. There's also a new line of output. ======= week2_monday_2.py ======= for line in open("/usr/coursehome/dalke/ambiguous_sequences.seq"): seq = line.rstrip() counts = {} for base in seq: if base not in counts: counts[base] = 1 else: counts[base] = counts[base] + 1 # Print the items in the dictionary print "Sequence has", len(seq), "bases" for base in counts: print base, "=", counts[base] =================================== @@ Assignment 3 @@ Add the lines to get the keys (which are the bases) from the dictionary, sort the keys (so the bases are in alphabetical order) then print the list of bases in sorted order. ======= week2_monday_3.py ======= for line in open("/usr/coursehome/dalke/ambiguous_sequences.seq"): seq = line.rstrip() counts = {} for base in seq: if base not in counts: counts[base] = 1 else: counts[base] = counts[base] + 1 # Print the items in the dictionary print "Sequence has", len(seq), "bases" bases = counts.keys() bases.sort() for base in bases: print base, "=", counts[base] =================================== @@ Assignment 4 @@ The biggest change here is to move the counts from inside the for loop to outside the for loop. ======= week2_monday_4.py ======= counts = {} size = 0 for line in open("/usr/coursehome/dalke/ambiguous_sequences.seq"): seq = line.rstrip() size = size + len(seq) for base in seq: if base not in counts: counts[base] = 1 else: counts[base] = counts[base] + 1 # Print the items in the dictionary print "File has", size, "bases" bases = counts.keys() bases.sort() for base in bases: print base, "=", counts[base] =================================== @@ Assignment 5 @@ I ran the following program and got /usr/coursehome/dalke/ambiguous_sequences.seq took 0:00:00.023785 /usr/coursehome/dalke/sequences.seq took 0:00:01.193923 /usr/coursehome/dalke/many_sequences.seq took 0:00:55.269629 ======= week2_monday_5.py ======= import datetime for filename in ["/usr/coursehome/dalke/ambiguous_sequences.seq", "/usr/coursehome/dalke/sequences.seq", "/usr/coursehome/dalke/many_sequences.seq"]: start_time = datetime.datetime.now() counts = {} size = 0 for line in open(filename): seq = line.rstrip() size = size + len(seq) for base in seq: if base not in counts: counts[base] = 1 else: counts[base] = counts[base] + 1 end_time = datetime.datetime.now() print filename, "took", end_time - start_time =================================== For fun, I tried this version, which assumes that most of the time there are only unambiguous codes so first counts the A, T, C, and G bases. If those don't work then count the ambiguous letters. If even those still don't work, test every base the sequence one by one. The new version is much more complex but is over 15% faster. /usr/coursehome/dalke/ambiguous_sequences.seq took 0:00:00.103606 /usr/coursehome/dalke/sequences.seq took 0:00:00.885709 /usr/coursehome/dalke/many_sequences.seq took 0:00:43.007531 ======= week2_monday_5b.py ======= import datetime for filename in ["/usr/coursehome/dalke/ambiguous_sequences.seq", "/usr/coursehome/dalke/sequences.seq", "/usr/coursehome/dalke/many_sequences.seq"]: start_time = datetime.datetime.now() counts = {} size = 0 for line in open(filename): seq = line.rstrip() size = size + len(seq) known_count = 0 for letter in "ATCG": n = seq.count(letter) if n != 0: counts[letter] = counts.get(letter, 0) + n known_count = known_count + n if known_count != len(seq): # Handle the ambiguous letters for letter in "BDHKMNSRWVYX": n = seq.count(letter) if n != 0: counts[letter] = counts.get(letter, 0) + n known_count = known_count + n if known_count != len(seq): # it has something besides the standard 1-letter codes # Figure out what those might be for base in seq: if base not in "ATCGBDHKMNSRWVYX": counts[base] = counts.get(base, 0) + 1 end_time = datetime.datetime.now() print filename, "took", end_time - start_time =================================== @@ Assignment 6 @@ ======= week2_monday_6.py ======= complement_table = {"A": "T", "T": "A", "C": "G", "G": "C"} for line in open("/coursehome/dalke/10_sequences.seq"): seq = line.rstrip() new_seq = [] for base in seq: new_seq.append( complement_table[base] ) new_seq = "".join(new_seq) print new_seq =================================== @@ Assignment 7 @@ ======= week2_monday_7.py ======= ambiguous_dna_complement = { "A": "T", "C": "G", "G": "C", "T": "A", "M": "K", "R": "Y", "W": "W", "S": "S", "Y": "R", "K": "M", "V": "B", "H": "D", "D": "H", "B": "V", "N": "N", } for line in open("/coursehome/dalke/sequences.seq"): seq = line.rstrip() new_seq = [] for base in seq: new_seq.append( ambiguous_dna_complement[base] ) new_seq = "".join(new_seq) print new_seq =================================== @@ Assignment 8 @@ ======= week2_monday_8.py ======= # Start by copying the codon table I gave you. table = { 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', } # The table excludes stop codons so I'll use a "*" for # the translation of those three. stop_codons = [ 'TAA', 'TAG', 'TGA'] for codon in stop_codons: table[codon] = "*" # Get the sequence seq = raw_input("Enter DNA sequence: ") protein = [] for i in range(0, len(seq) - len(seq)%3, 3): protein.append( table[seq[i:i+3]] ) protein = "".join(protein) print protein =================================== ****** Week2 - Tuesday ****** @@ Assignment A @@ ======= week2_tuesday_A.py ======= def add(a, b): return a + b print "2+3 =", add(2, 3) print "5+9 =", add(5, 9) =================================== @@ Assignment B @@ ======= week2_tuesday_B.py ======= def add3(a, b, c): return a + b + c print "2+3+4 =", add3(2, 3, 4) print "5+9+10 =", add3(5, 9, 10) =================================== @@ Assignment C @@ ======= week2_tuesday_C.py ======= def count_bases(seq): counts = {} for base in seq: if base not in counts: counts[base] = 1 else: counts[base] = counts[base] + 1 return counts seq = raw_input("Enter DNA sequence: ") counts = count_bases(seq) # Print the items in the dictionary for base in counts: print base, "=", counts[base] =================================== @@ Assignment D @@ ======= week2_tuesday_D.py ======= def count_bases(seq): counts = {} for base in seq: if base not in counts: counts[base] = 1 else: counts[base] = counts[base] + 1 return counts for line in open("/usr/coursehome/dalke/ambiguous_sequences.seq"): seq = line.rstrip() counts = count_bases(seq) # Print the items in the dictionary print "Sequence has", len(seq), "bases" for base in counts: print base, "=", counts[base] =================================== @@ Assignment E @@ ======= week2_tuesday_E.py ======= # This function returns True if the sequence has the # string "CTATA", otherwise it returns False. def has_CTATA(seq): return "CTATA" in seq # This function returns True if the sequence is over 1000 # bases long, otherwise it return False. def is_over_1000(seq): return len(seq) > 1000 # Return the GC content for a sequence def gc_content(seq): return (seq.count("G") + seq.count("C")) / float(len(seq)) def is_high_GC(seq): return gc_content(seq) > 0.5 def is_over_2000_high_GC(seq): return seq > 2000 and gc_content(seq) > 0.5 num_sequences = 0 num_CTATA = 0 num_over_1000 = 0 num_high_GC = 0 num_over_2000_high_GC = 0 for line in open("/usr/coursehome/dalke/sequences.seq"): seq = line.rstrip() num_sequences = num_sequences + 1 if has_CTATA(seq): num_CTATA = num_CTATA + 1 if is_over_1000(seq): num_over_1000 = num_over_1000 + 1 if is_high_GC(seq): num_high_GC = num_high_GC + 1 if is_over_2000_high_GC: num_over_2000_high_GC = num_over_2000_high_GC + 1 print num_sequences, "sequences" print num_CTATA, "with the pattern CTATA" print num_over_1000, "have over 1000 bases" print num_high_GC, "have over 50% GC" print num_over_2000_high_GC, "have over 2000 bases and over 50% GC" =================================== There is a lot of repetition in the above. Here's a somewhat nice way. ======= week2_tuesday_E2.py ======= def always_true(seq): return True def has_CTATA(seq): return "CTATA" in seq def is_over_1000(seq): return len(seq) > 1000 def gc_content(seq): return (seq.count("G") + seq.count("C")) / float(len(seq)) def is_high_GC(seq): return gc_content(seq) > 0.5 def is_over_2000_high_GC(seq): return seq > 2000 and gc_content(seq) > 0.5 # A list of 3-item lists. The three items are: # - the dictionary key name to use for a given property # - the descriptive text for that field # - the function to use to test if a sequence has that property fields = [ ["total", "sequences", always_true], ["CTATA", "with the pattern CTATA", has_CTATA], ["over_1000", "have over 1000 bases", is_over_1000], ["high_GC", "have over 50% GC", is_high_GC], ["over_2000_high_GC", "have over 2000 bases and over 50% GC", is_over_2000_high_GC], ] # Initialize the counts to 0 seq_counts = {} for field in fields: name = field[0] seq_counts[name] = 0 for line in open("/usr/coursehome/dalke/sequences.seq"): seq = line.rstrip() for field in fields: name = field[0] function = field[2] if function(seq): seq_counts[name] = seq_counts[name] + 1 for field in fields: name = field[0] description = field[1] print seq_counts[name], description =================================== I can simplify it with a special notation for iterating over lists and using a trick that the numeric value of True is 1 and of False is 0 ======= week2_tuesday_E3.py ======= def always_true(seq): return True def has_CTATA(seq): return "CTATA" in seq def is_over_1000(seq): return len(seq) > 1000 def gc_content(seq): return (seq.count("G") + seq.count("C")) / float(len(seq)) def is_high_GC(seq): return gc_content(seq) > 0.5 def is_over_2000_high_GC(seq): return seq > 2000 and gc_content(seq) > 0.5 # A list of 3-item lists. The three items are: # - the dictionary key name to use for a given property # - the descriptive text for that field # - the function to use to test if a sequence has that property fields = [ ["total", "sequences", always_true], ["CTATA", "with the pattern CTATA", has_CTATA], ["over_1000", "have over 1000 bases", is_over_1000], ["high_GC", "have over 50% GC", is_high_GC], ["over_2000_high_GC", "have over 2000 bases and over 50% GC", is_over_2000_high_GC], ] # Initialize the counts to 0 seq_counts = {} for field in fields: seq_counts[field[0]] = 0 for line in open("/usr/coursehome/dalke/sequences.seq"): seq = line.rstrip() for name, description, function in fields: seq_counts[name] = seq_counts[name] + function(seq) for name, description, function in fields: print seq_counts[name], description =================================== @@ Assignment F @@ ======= week2_tuesday_F.py ======= table = { 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', } stop_codons = [ 'TAA', 'TAG', 'TGA'] for codon in stop_codons: table[codon] = "*" # A function to convert a DNA sequence into protein def translate_dna(seq): protein = [] for i in range(0, len(seq) - len(seq)%3, 3): protein.append( table[seq[i:i+3]] ) return "".join(protein) # Get the sequence seq = raw_input("Enter DNA sequence: ") # Print the translated sequence print translate_dna(seq) ================================= ****** Week2 - Wednesday ****** @@ Assignment 30 @@ ======= seq_functions.py ======= BASES = "ATCG" def GC_content(s): return (s.count("G") + s.count("C")) / float(len(s)) ================================ ======= week2_wednesday_30.py ====== import seq_functions seq = raw_input("Enter DNA sequence: ") print "%GC content:", seq_functions.GC_content(seq) =================================== @@ Assignment 31 @@ ======= seq_functions.py ====== BASES = "ATCG" def GC_content(s): return (s.count("G") + s.count("C")) / float(len(s)) def count_bases(seq): counts = {} for base in seq: if base not in counts: counts[base] = 1 else: counts[base] = counts[base] + 1 return counts def print_bases(counts): keys = counts.keys() keys.sort() for base in keys: print base + ":", counts[base] =================================== ======= week2_wednesday_31.py ====== import seq_functions seq = raw_input("Enter DNA sequence: ") print "%GC content:", seq_functions.GC_content(seq) counts = seq_functions.count_bases(seq) seq_functions.print_bases(counts) =================================== @@ Assignment 32 @@ ======= seq_functions.py ====== BASES = "ATCG" def GC_content(s): return (s.count("G") + s.count("C")) / float(len(s)) def count_bases(seq): counts = {} for base in seq: if base not in counts: counts[base] = 1 else: counts[base] = counts[base] + 1 return counts def print_bases(counts): keys = counts.keys() keys.sort() for base in keys: print base + ":", counts[base] def always_true(seq): return True def has_CTATA(seq): return "CTATA" in seq def is_over_1000(seq): return len(seq) > 1000 def gc_content(seq): return (seq.count("G") + seq.count("C")) / float(len(seq)) def is_high_GC(seq): return gc_content(seq) > 0.5 def is_over_2000_high_GC(seq): return seq > 2000 and gc_content(seq) > 0.5 =================================== ======= week2_monday_32.py ======= import seq_functions fields = [ ["total", "sequences", seq_functions.always_true], ["CTATA", "with the pattern CTATA", seq_functions.has_CTATA], ["over_1000", "have over 1000 bases", seq_functions.is_over_1000], ["high_GC", "have over 50% GC", seq_functions.is_high_GC], ["over_2000_high_GC", "have over 2000 bases and over 50% GC", seq_functions.is_over_2000_high_GC], ] # Initialize the counts to 0 # This version uses some Python I haven't taught you! seq_counts = dict.fromkeys( [field[0] for field in fields], 0) for line in open("/usr/coursehome/dalke/sequences.seq"): seq = line.rstrip() for name, description, function in fields: seq_counts[name] = seq_counts[name] + function(seq) for name, description, function in fields: print seq_counts[name], description =================================== ****** Week2 - Thursday ****** @@ Assignment 1 @@ There were at least two mistakes. In week2_tuesday_E.py this if is_over_2000_high_GC: num_over_2000_high_GC = num_over_2000_high_GC + 1 was missing the function call and should have been if is_over_2000_high_GC(): num_over_2000_high_GC = num_over_2000_high_GC + 1 and in week1_saturday_8.py the molecular weight code should have added the 18.0 for the water. Also corrected in this file. @@ Assignment 3a @@ The number of records is equal to the number of lines in the file which start with a >. ======= week2_thursday_3a.py ======= num_records = 0 filename = "ls_orchid.fasta" for line in open(filename): if line.startswith(">"): num_records = num_records + 1 print filename, "has", num_records, "records" ==================================== The other lines are the sequence line and the empty line. I can get the total number of bases by counting the characters (excluding the newline) in lines which aren't the title line. ======= week2_thursday_3b.py ===== num_records = 0 num_bases = 0 filename = "ls_orchid.fasta" for line in open(filename): if line.startswith(">"): num_records = num_records + 1 else: num_bases = num_bases + len(line.rstrip()) print filename, "has", num_records, "records and", num_bases ================================== ======= week2_thursday_4.py ===== filename = "ls_orchid.fasta" for line in open(filename): if line.startswith(">"): # the [1:] gets rid of the leading > title = line.rstrip()[1:] print title ================================== ======= week2_thursday_5.py ===== filename = "ls_orchid.fasta" in_record = False for line in open(filename): if line.startswith(">"): if "P.tonsum" in line: in_record = True if in_record: print line.rstrip() # End of a FASTA record if line == "\n" and in_record: print_record ================================== ======= week2_thursday_6.py ===== import sys print float(sys.argv[1]) + float(sys.argv[2]) ================================= ======= week2_thursday_6.py ===== import sys search_text = sys.argv[1] filename = "ls_orchid.fasta" in_record = False for line in open(filename): if line.startswith(">"): if search_text in line: in_record = True if in_record: print line.rstrip() # End of a FASTA record if line == "\n" and in_record: print_record ================================== ======= week2_thursday_7.py ===== filename = "ls_orchid.fasta" title = None for line in open(filename): line = line.rstrip() if line.startswith(">"): title = line[1:] # remove the leading ">" sequence = "" elif line == "": # The blank line # end of a record, check if the previous sequence line ends with "C" if sequence.endswith("C"): print title else: # Must be a sequence line. Save it to test if the next line # is the end of the record sequence = line ================================= ****** Week2 - Friday ****** There were no exercises for the regular expression lecture. ****** Week2 - Saturday ****** @@ Assignment 1 @@ This uses a regular expression to test if there are 5 hydrophobics in a row. ======= week2_saturday_1.py ======= import re seq = raw_input("Protein sequence? ") if re.search(r"[FILAPVM]{5}", seq): print "Hydrophobic signal" else: print "No hydrophobic signal" =================================== @@ Assignment 2 @@ ======= week2_saturday_2.py ======= import re seq = raw_input("Protein sequence? ") if re.search(r"[FILAPVM]{7}", seq): print "Strong hydrophobic signal" elif re.search(r"[FILAPVM]{3}", seq): print "Weak hydrophobic signal" else: print "No hydrophobic signal" =================================== @@ Assignment 3 @@ The pattern is C.{2,4}C.{3}[LIVMFYWC].{8}H.{3,5} Going from left to right this means C -> C .{2,4} -> AA C -> C .{3} -> AAA [LIVMFYWC] -> L .{8} -> AAAAAAAA H -> H .{3,5} -> AAA --> CAACAAALAAAAAAAAHAAA should work. Testing in Python >>> import re >>> if re.match(r"C.{2,4}C.{3}[LIVMFYWC].{8}H.{3,5}", ... "CAACAAALAAAAAAAAHAAA"): ... print "It matches." ... It matches. >>> @@ Assignment 4 @@ ======= week2_saturday_4.py ======= seq = raw_input("Enter DNA sequence: ") start = 0 while 1: # Find the start of the next restriction site pos = seq.find("GAATTC", start) if pos == -1: # Reached the end of the string. # Print what's left and exit the loop. print seq[start:] break ## Found the start of the site. # Add in the offset to the first T in TT pos = pos + 4 # Print the newly cut sequence print seq[start:pos] # And restart at the newly 5' T start = pos ===================================