# A real format
# SWISS-PROT 38
"""Parser for the SWISS-PROT 38 format.
You probably want to use the variables 'record' (for a single record)
and 'format' (for a set of records).
"""
import Martel
from Martel import RecordReader, Time
from Bio import Std
def Simple(tag, tag_data):
    """Build the Martel expression for a simple one-line field.

    The result is a group named `tag` made of the literal text
    ``tag + " "`` followed by ``Martel.ToEol(tag_data)``.

    tag -- line code; used both as the literal line prefix and as the
           name of the enclosing group
    tag_data -- name passed to Martel.ToEol (presumably the name under
                which the rest of the line is reported -- confirm against
                the Martel documentation)
    """
    line_prefix = Martel.Str(tag + " ")
    line_body = Martel.ToEol(tag_data)
    return Martel.Group(tag, line_prefix + line_body)
#--- ID
# The ID line: entry name, data class, molecule type and sequence length.
# The entry name is wrapped with Std.dbid and tagged as the primary
# identifier of the "sp" database.
ID = Martel.Group(
    "ID",
    (
        Martel.Str("ID ")
        + Std.dbid(
            Martel.Word("entry_name"),
            {"type": "primary", "dbname": "sp"},
        )
        + Martel.Spaces()
        + Martel.Word("data_class_table")
        + Martel.Str(";")
        + Martel.Spaces()
        + Martel.Word("molecule_type")
        + Martel.Str(";")
        + Martel.Spaces()
        + Martel.Digits("sequence_length")
        + Martel.Str(" AA.")
        + Martel.AnyEol()
    ),
)
#--- AC
# An AC line holds one accession number followed by ";" and then any
# number of further " <accession>;" items before the end of line.
AC = Martel.Group(
    "AC",
    (
        Martel.Str("AC ")
        + Std.dbid(
            Martel.Word("ac_number"),
            {"type": "accession", "dbname": "sp"},
        )
        + Martel.Str(";")
        # NOTE(review): unlike the first accession, the repeated ones carry
        # no "dbname" attribute -- confirm that this is intentional.
        + Martel.Rep(
            Martel.Str(" ")
            + Std.dbid(Martel.Word("ac_number"), {"type": "accession"})
            + Martel.Str(";")
        )
        + Martel.AnyEol()
    ),
)

# One or more consecutive AC lines.
AC_block = Martel.Group("AC_block", Martel.Rep1(AC))
#--- DT
# Three kinds of DT line are defined here: creation, last sequence update
# and last annotation update.  Each carries a date and a two-digit release
# number.
#
# All regex patterns are raw strings so that sequences such as "\(", "\d"
# and "\R" reach Martel.Re untouched instead of relying on Python leaving
# unrecognised string escapes alone (which newer interpreters warn about).

# Earlier pure-regex form of DT_created, kept for reference:
##DT_created = Martel.Group("DT_created", Martel.Re(
##    r"DT (?P<day>\d\d)-(?P<month>...)-(?P<year>\d{4}) \(Rel. "\
##    r"(?P<release>\d\d), Created\)\R"
##    ))
DT_created = Martel.Group(
    "DT_created",
    (
        Martel.Str("DT ")
        # The date part is generated by Martel's Time module from a
        # day-month-year template rather than written out by hand.
        + Time.make_expression("%(DD)-%(Jan)-%(YYYY)")
        + Martel.Re(r" \(Rel. (?P<release>\d\d), Created\)\R")
    ),
)

# The two update lines still spell the date out as explicit named groups
# (day, month, year) instead of going through Time.make_expression.
DT_seq_update = Martel.Group(
    "DT_seq_update",
    Martel.Re(
        r"DT (?P<day>\d\d)-(?P<month>...)-(?P<year>\d{4}) \(Rel. "
        r"(?P<release>\d\d), Last sequence update\)\R"
    ),
)

DT_ann_update = Martel.Group(
    "DT_ann_update",
    Martel.Re(
        r"DT (?P<day>\d\d)-(?P<month>...)-(?P<year>\d{4}) \(Rel. "
        r"(?P<release>\d\d), Last annotation update\)\R"
    ),
)
#--- DE
# Only the final DE line of an entry is supposed to end with ".", but that
# rule is deliberately not enforced by this grammar.
DE = Martel.Group(
    "DE",
    (
        Martel.Str("DE ")
        + Std.description(Martel.UntilEol("description"))
        + Martel.AnyEol()
    ),
)

# One or more DE lines, wrapped as a standard description block.
DE_block = Std.description_block(
    Martel.Group("DE_block", Martel.Rep1(DE))
)
# The next four fields are plain "<code> <text>" lines built with Simple();
# each *_block groups one or more consecutive lines of the same kind.

#--- GN
GN = Simple("GN", "gene_names")
GN_block = Martel.Group(
    "GN_block",
    Martel.Rep1(GN),
)

#--- OS
OS = Simple("OS", "organism_species")
OS_block = Martel.Group(
    "OS_block",
    Martel.Rep1(OS),
)

#--- OG
OG = Simple("OG", "organelle")
OG_block = Martel.Group(
    "OG_block",
    Martel.Rep1(OG),
)

#--- OC
OC = Simple("OC", "organism_classification")
OC_block = Martel.Group(
    "OC_block",
    Martel.Rep1(OC),
)
############ Reference section
# A reference is RN, RP, optional RC lines, an optional RX line, one or
# more RA lines, optional RT lines and one or more RL lines, in that order.
#
# Regex patterns are raw strings so that "\[", "\d", "\w", "\." and "\R"
# are passed to Martel.Re verbatim rather than depending on Python keeping
# unrecognised string escapes intact.

#--- RN
# occurs once
RN = Martel.Group(
    "RN",
    Martel.Re(r"RN \[(?P<reference_number>\d+)]\R"),
)

#--- RP
# occurs once
RP = Simple("RP", "reference_position")

#--- RC
# 0 or more (the block itself needs at least one line; it is made optional
# where the reference is assembled)
RC = Simple("RC", "reference_comment")
RC_block = Martel.Group("RC_block", Martel.Rep1(RC))

#--- RX
# 0 or 1
RX = Martel.Group(
    "RX",
    Martel.Re(
        r"RX (?P<bibliographic_database_name>\w+); "
        r"(?P<bibliographic_identifier>\d+)\.\R"
    ),
)

#--- RA
# 1 or more
RA = Simple("RA", "reference_author")
RA_block = Martel.Group("RA_block", Martel.Rep1(RA))

#--- RT
# 0 or more (made optional where the reference is assembled)
RT = Simple("RT", "reference_title")
RT_block = Martel.Group("RT_block", Martel.Rep1(RT))

#--- RL
# 1 or more
RL = Simple("RL", "reference_location")
RL_block = Martel.Group("RL_block", Martel.Rep1(RL))

# A complete literature reference.
reference = Martel.Group(
    "reference",
    (
        RN
        + RP
        + Martel.Opt(RC_block)
        + Martel.Opt(RX)
        + RA_block
        + Martel.Opt(RT_block)
        + RL_block
    ),
)
############
#--- CC
# Regex patterns are raw strings so that "\R" is handed to Martel.Re as
# written instead of relying on Python preserving an unrecognised escape.

# First line of a comment topic: starts with the "-!-" marker.
CC_begin = Martel.Group(
    "CC",
    Martel.Re(r"CC -!- ") + Martel.ToEol("comment_text"),
)

# Any further CC line.  Note that it shares the group name "CC" with
# CC_begin.
CC = Martel.Group(
    "CC",
    Martel.Re(r"CC ") + Martel.ToEol("comment_text"),
)

# One comment: a "-!-" line followed by any number of plain CC lines.
single_comment = Martel.Group(
    "comment",
    CC_begin + Martel.Rep(CC),
)

# The copyright notice is delimited by CC lines consisting of dashes.
CC_copyright_begin = Martel.Group(
    "CC_copyright_begin",
    Martel.Re(r"CC -+\R"),
)

# A line inside the notice; the negative lookahead stops it from matching
# the closing line of dashes.
CC_copyright = Martel.Group(
    "CC_copyright",
    Martel.Re(r"CC (?!-+\R)") + Martel.ToEol("copyright"),
)

CC_copyright_end = Martel.Group(
    "CC_copyright_end",
    Martel.Re(r"CC -+\R"),
)

# From N33_HUMAN
# Hard-coded pattern for one specific DR line (MIM; 601385) which is
# accepted between the comments and the copyright notice.
bogus_DR_group = Martel.Group(
    "bogus_DR_block",
    Martel.Re(
        r"(?P<DR>DR (?P<database_identifier>MIM); "
        r"(?P<primary_identifier>601385); "
        r"(?P<secondary_identifier>-).\R)"
    ),
)

# Whole comment section: any number of comments, the optional bogus DR
# line, then an optional copyright notice.
comment = Martel.Group(
    "comment_block",
    (
        Martel.Rep(single_comment)
        + Martel.Opt(bogus_DR_group)
        + Martel.Opt(
            CC_copyright_begin
            + Martel.Rep(CC_copyright)
            + CC_copyright_end
        )
    ),
)
#--- DR
# The secondary identifier needs special care for lines such as
#   DR MGD; MGI:95401; EPB4.1.
# where the scan must run up to the last ".": the secondary identifier
# should be "EPB4.1", not "EPB4" and not "EPB4.1.".
# The pattern accepts any character that is neither "." nor end-of-line,
# or a "." that is not immediately followed by end-of-line.
_to_secondary_end = Martel.Re(r"([^.\R]|(?!.\R)\.)+")

# Building blocks shared by the DR variants.
database_id = Std.dbxref_dbname(
    Martel.UntilSep("database_identifier", ";"),
    {"style": "sp"},
)
primary_id = Std.dbxref_dbid(
    Martel.UntilSep("primary_identifier", ";"),
    {"type": "primary"},
)
secondary_id = Std.dbxref_dbid(
    Martel.Group("secondary_identifier", _to_secondary_end),
    {"type": "accession"},
)

# used in StdHandler for fast dbxref - don't rename!
real_DR_general = Std.dbxref(
    database_id
    + Martel.Str("; ")
    + primary_id
    + Martel.Str("; ")
    + secondary_id,
)
fast_DR_general = Std.fast_dbxref(
    real_DR_general,
    {"style": "sp-general"},
)
# FastFeature is given the fast expression, the feature name
# "fast-sp-dbxref" and the group names of the full expression.
DR_general = Martel.FastFeature(
    fast_DR_general,
    "fast-sp-dbxref",
    real_DR_general.group_names(),
)
# Cross-reference variant for the PROSITE and PFAM databases: database
# name, primary id, an unnamed accession field and a status identifier
# that runs up to the next ".".
# used in StdHandler for fast dbxref - don't rename!
real_DR_prosite = Std.dbxref(
    Std.dbxref_dbname(
        Martel.Group(
            "database_identifier",
            Martel.Str("PROSITE", "PFAM"),
        ),
        {"style": "sp"},
    )
    + Martel.Str("; ")
    + primary_id
    + Martel.Str("; ")
    + Std.dbxref_dbid(Martel.UntilSep(sep = ";"), {"type": "accession"})
    + Martel.Str("; ")
    + Martel.UntilSep("status_identifier", "."),
)

# used in StdHandler for fast dbxref - don't rename!
fast_DR_prosite = Std.fast_dbxref(
    real_DR_prosite,
    {"style": "sp-prosite"},
)
DR_prosite = Martel.FastFeature(
    fast_DR_prosite,
    "fast-sp-dbxref",
    real_DR_prosite.group_names(),
)
# Cross-reference variant for EMBL: like the PROSITE/PFAM form, except
# that the accession field is named "secondary_identifier".
real_DR_embl = Std.dbxref(
    Std.dbxref_dbname(
        Martel.Group("database_identifier", Martel.Str("EMBL")),
        {"style": "sp"},
    )
    + Martel.Str("; ")
    + primary_id
    + Martel.Str("; ")
    + Std.dbxref_dbid(
        Martel.UntilSep("secondary_identifier", ";"),
        {"type": "accession"},
    )
    + Martel.Str("; ")
    + Martel.UntilSep("status_identifier", "."),
)
fast_DR_embl = Std.fast_dbxref(real_DR_embl, {"style": "sp-embl"})
DR_embl = Martel.FastFeature(
    fast_DR_embl,
    "fast-sp-dbxref",
    real_DR_embl.group_names(),
)

# A full DR line.  The alternatives are listed with the two specific forms
# (EMBL, PROSITE/PFAM) ahead of the general one.
DR = Martel.Group(
    "DR",
    (
        Martel.Str("DR ")
        + Martel.Group(
            "database_reference",
            DR_embl | DR_prosite | DR_general,
        )
        + Martel.Str(".")
        + Martel.AnyEol()
    ),
)

# One or more DR lines.
DR_block = Martel.Group("DR_block", Martel.Rep1(DR))
#--- KW
# Keyword lines are plain "KW <text>" lines; the block groups one or more
# of them.
KW = Simple("KW", "keyword")
KW_block = Martel.Group(
    "KW_block",
    Martel.Rep1(KW),
)
#--- FT
# Layout notes (example line, column ruler and the derived pattern):
# FT DOMAIN 77 88 ASP/GLU-RICH (ACIDIC).
# 123456789012345678901234567890123456789012345678901234567890123456789012345
# 1 2 3 4 5 6 7
# FT ........ ...... ...... .........................................
# FT 12345678 123456 123456 12345678901234567890123456789012345678901
# FT .{8} .{6} .{6} [^\R]*
# 1 1 1234567
# "FT " + ".{8}" + " " + ".{6}" + " " + ".{6}" + " " + "[^\R]*" + "\R"
# "FT .{8} .{6} .{6} [^\R]*\R"
#
# NOTE(review): the ruler above suggests fixed-width columns, while the
# literal separators below are single spaces -- verify the whitespace in
# these string literals against a real record.

# Earlier regex-based definitions, kept for reference:
##FT_range = Martel.Group("FT",
##    Martel.Re("FT (?P<ft_name>.{8}) " \
##              "(?P<ft_from>.{6}) (?P<ft_to>.{6})" \
##              "( (?P<ft_description>[^\R]*))?\R")
##    )
##FT_continuation = Martel.Group("FT_continuation",
##    Martel.Re("FT " \
##              "(?P<ft_description>[^\R]*)\R")
##    )
##FT = Martel.Group("feature", FT_range + Martel.Rep(FT_continuation))

# Field-level pieces: an 8-character name, two 6-character positions and a
# description running to the end of the line.
FT_name = Std.feature_name(Martel.Re(r".{8}"))
FT_start = Std.feature_location_start(Martel.Re(r".{6}"))
FT_end = Std.feature_location_end(Martel.Re(r".{6}"))
FT_desc = Std.feature_description(Martel.UntilEol())

# First line of a feature; the description part is optional.
FT_range = (
    Martel.Str("FT ")
    + FT_name
    + Martel.Str(" ")
    + FT_start
    + Martel.Str(" ")
    + FT_end
    + Martel.Opt(Martel.Str(" ") + FT_desc)
    + Martel.AnyEol()
)

# Follow-on line carrying only more description text.
FT_continuation = Martel.Str("FT ") + FT_desc + Martel.AnyEol()

# A feature is its first line plus any number of continuation lines.
FT = Std.feature(
    FT_range + Martel.Rep(FT_continuation),
    {"location-style": "sp"},
)

##feature_block = Martel.Group("feature_block", Martel.Rep1(FT))
feature_block = Std.feature_block(
    Martel.Rep1(FT),
    {"style": "swissprot"},
)
#--- SQ
# SQ SEQUENCE XXXX AA; XXXXX MW; XXXXX CRC32;
# (Those X's don't really indicate the size)
#
# The header pattern is written as raw strings so that "\d", "\w" and "\R"
# reach Martel.Re verbatim instead of depending on Python leaving
# unrecognised string escapes untouched.
SQ = Martel.Group(
    "SQ",
    Martel.Re(
        r"SQ SEQUENCE +(?P<sequence_length>\d+) AA;"
        r" +(?P<molecular_weight>\d+) MW;"
        r" +(?P<crc?type=32>\w+) CRC32;\R"
    ),
)

##SQ_data = Martel.Group("SQ_data",
##    Martel.Re(" (?P<sequence>[^\R]*)\R"))
# One line of sequence data: a leading space, then the residues up to the
# end of the line, wrapped with Std.sequence.
SQ_data = (
    Martel.Str(" ")
    + Std.sequence(Martel.UntilEol())
    + Martel.AnyEol()
)

##sequence = Martel.Group("sequence_block", Martel.Group("SQ_data_block",
##    SQ + Martel.Rep(SQ_data)))
# The SQ header followed by any number of data lines, tagged with the
# "iupac-ambiguous-protein" alphabet.
sequence = Std.sequence_block(
    SQ + Martel.Rep(SQ_data),
    {"alphabet": "iupac-ambiguous-protein"},
)
#--- //
# The "//" line that terminates an entry.
end = Martel.Group("END", Martel.Str("//") + Martel.AnyEol())

####################### put it all together
# A single record: the line types above in their fixed order.  ID, the AC
# block, the three DT lines, the comment section, the sequence and the
# terminator are always part of the expression; the other blocks are
# optional and the reference block may contain zero references.
record = Std.record(
    (
        ID
        + AC_block
        + DT_created
        + DT_seq_update
        + DT_ann_update
        + Martel.Opt(DE_block)
        + Martel.Opt(GN_block)
        + Martel.Opt(OS_block)
        + Martel.Opt(OG_block)
        + Martel.Opt(OC_block)
        # OX_block wraps a NullOp -- presumably a placeholder so that the
        # group name exists in this format version; confirm.
        + Martel.Group("OX_block", Martel.NullOp())
        + Martel.Group("reference_block", Martel.Rep(reference))
        + comment
        + Martel.Opt(DR_block)
        + Martel.Opt(KW_block)
        + Martel.Opt(feature_block)
        + sequence
        + end
    ),
    {"format": "swissprot/38"},
)
# The whole data set expressed as a single Martel expression: one or more
# records inside a "dataset" group.
format_expression = Martel.Group(
    "dataset",
    Martel.Rep1(record),
    {"format": "swissprot/38"},
)

# The same data set parsed record by record: RecordReader.EndsWith is
# given "//\n" as its argument and each record is parsed with `record`.
format = Martel.ParseRecords(
    "dataset",
    {"format": "swissprot/38"},
    record,
    RecordReader.EndsWith,
    ("//\n",),
)
if __name__ == "__main__":
    # Ad-hoc driver: parse a local SWISS-PROT 38 dump, keeping only the
    # "entry_name" and "sequence" groups.
    exp = Martel.select_names(format, ("entry_name", "sequence"))
    parser = exp.make_parser()
    # NOTE: the path is hard-coded to one developer's machine.
    infile = open("/home/dalke/ftps/swissprot/sprot38.dat")
    try:
        parser.parseFile(infile)
    finally:
        # Close the handle even if parsing raises, instead of leaving it
        # to garbage collection.
        infile.close()
|
|