Index: scripts/ama-qvalues.txt =================================================================== --- scripts/ama-qvalues.txt (revision 0) +++ scripts/ama-qvalues.txt (revision 4529) @@ -0,0 +1,128 @@ +#!@WHICHPERL@ +# AUTHOR: Timothy L. Bailey +# CREATE DATE: May 7, 2010 +#use strict; +use File::Basename; + +$PGM = $0; # name of program +$PGM =~ s#.*/##; # remove part up to last slash +#@args = @ARGV; # arguments to program +$| = 1; # flush after all prints +$SIG{'INT'} = \&cleanup; # interrupt handler +# Note: so that interrupts work, always use for system calls: +# if ($status = system("$command")) {&cleanup($status)} + +# requires +push(@INC, split(":", $ENV{'PATH'})); # look in entire path + +# defaults +my $seed = 1; +my $bootstraps = 1000; + +my $usage = <] seed for random numbers; default: $seed + [-bootstraps ] number of bootstraps to perform + while computing pi_0; default: 1000 + + Read an AMA output file and output each sequence name, p-value + and q-value. At the end of file, the value of pi_0 (number of + sequences not showing significant binding to motif) is shown, + unless there are fewer than 100 p-values in the input. + + Reads standard input. + Writes standard output. + + Copyright + (2010) The University of Queensland + All Rights Reserved. + Author: Timothy L. Bailey +USAGE + +$nargs = 0; # number of required args +if ($#ARGV+1 < $nargs) { &print_usage("$usage", 1); } + +# get input arguments +while ($#ARGV >= 0) { + $_ = shift; + if ($_ eq "-h") { # help + &print_usage("$usage", 0); + } elsif ($_ eq "-seed") { + $seed = shift; + } elsif ($_ eq "-bootstraps") { + $bootstraps = shift; + } else { + &print_usage("$usage", 1); + } +} + +# open a pipe to the qvalue program, reading standard input +$pzfile = "$PGM.$$.pzfile.tmp"; +open(QVALUE, "|qvalue --append --column 2 --pi-zero-file $pzfile --seed $seed --bootstraps $bootstraps -") || + die("Can't open qvalue program.\n"); + +# compute the qvalues +print "# sequence_name\t\t\tp-value\t\tq-value\n"; +my($pvalue, @w, $name); +my $npvalues = 0; +while () { + if (/pvalue="([^"]*)"/) { + $pvalue = $1; + /name="([^"]*)"/; + $name = $1; + print(QVALUE "$name\t$pvalue\n"); + $npvalues++; + } +}; +close(QVALUE); + +printf("# Fraction of sequences not showing significant binding: "); +system("cat $pzfile") if ($npvalues >= 100); +unlink $pzfile; + +$status = 1; + +# cleanup files +&cleanup($status, ""); + +################################################################################ +# Subroutines # +################################################################################ + +################################################################################ +# +# print_usage +# +# Print the usage message and exit. +# +################################################################################ +sub print_usage { + my($usage, $status) = @_; + + if (-c STDOUT) { # standard output is a terminal + open(C, "| more"); + print C $usage; + close C; + } else { # standard output not a terminal + print STDERR $usage; + } + + exit $status; +} + +################################################################################ +# cleanup +# +# cleanup stuff +# +################################################################################ +sub cleanup { + my($status, $msg) = @_; + if ($status eq "INT") { + exit(1); + } else { + if ($status && "$msg") {print STDERR "$msg: $status\n";} + } +} Index: scripts/fasta-subsample.txt =================================================================== --- scripts/fasta-subsample.txt (revision 0) +++ scripts/fasta-subsample.txt (revision 4529) @@ -0,0 +1,142 @@ +#!/usr/bin/perl +##!@WHICHPERL@ + +my $pgm = $0; # name of program +$pgm =~ s#.*/##; # remove part up to last slash +my $seed = 1; # random number seed +my $line_size = 50; # size of lines to print +my $off = 1; # first position of sequence to print +my $len = -1; # maximum length of sequence to print +my $rest = ""; # file to receive remainder of seqs + +$usage = < [-seed ] [-rest ] + [-off ] [-len ] + + name of FASTA sequence file + number of sequences to output + [-seed ] random number seed; default: $seed + [-rest ] name of file to receive the FASTA + sequences not being output; default: none + [-off ] print starting at position in each + sequence; default: $off + [-len ] print up to characters for each + sequence; default: print entire sequence + + Output a random subsample of size of the sequences in + a FASTA sequence file. The seed of the random generator can be + changed using -seed, otherwise the same subset of sequences will + always be output. If requested, the remaining sequences will + be output to a file named , which is useful for + cross-validation. + + You can also choose to only output portions of each sequence + using the -off and -len switches. + + Writes to standard output. +USAGE + +if ($#ARGV+1 < 2) { # wrong number of arguments + die $usage; +} + +# get input arguments +my $fasta = shift; # name of fasta file +my $n = shift; # size of subsample +while ($#ARGV >= 0) { + $_ = shift; + if ($_ eq "-seed") { # random seed + $seed = shift; + } elsif ($_ eq "-rest") { # rest file name + $rest = shift; + } elsif ($_ eq "-off") { # offset + $off = shift; + } elsif ($_ eq "-len") { # maximum length + $len = shift; + } elsif ($_ eq "-rest") { # file for remainder + $rest = shift; + } else { + die $usage; + } +} + +# read in FASTA file and make index +open(FASTA, "<$fasta") || die "Couldn't open file `$fasta'.\n"; +my $byte = 0; +my %index; # ID-to-start index +my $id; # sequence ID +my @rest; # dummy +my @id_list; # list of all IDs +while () { + if (/^>/) { + ($id, @rest) = split; + $index{$id} = $byte; # start of sequence record + push @id_list, $id; + } + $byte += length; +} # read FASTA file + +# check that there are enough IDs +my $nseqs = @id_list; +die ("Not enough sequences ($nseqs); $n requested.\n") if ($nseqs < $n); + +# shuffle the list of IDs +srand($seed); +shuffle(\@id_list); +#print join " ", @id_list, "\n"; + +# output the requested number of FASTA sequences to STDOUT +foreach $id (@id_list[0..$n-1]) { + print_fasta_seq_portion(*FASTA, *STDOUT, $id, $off, $len, \%index); +} # id + +# output the remainder of the sequences if requested +# to the "rest" file +if ($rest) { + open(REST, ">$rest") || die("Can't open file `$rest'.\n"); + foreach $id (@id_list[$n..$nseqs-1]) { + print_fasta_seq_portion(*FASTA, *REST, $id, $off, $len, \%index); + } +} + +################################################################################ +# print (a portion of) a FASTA sequence +# Assumes FASTA file is open and the index contains +# the file byte offset for a given ID. +sub print_fasta_seq_portion { + my ($fasta, $output, $id, $off, $len, $index) = @_; + + my $addr = $index{$id}; # address of sequence + die "Can't find target $id.\n" if ($addr eq undef); + seek($fasta, $addr, 0); # move to start of target + $_ = <$fasta>; + print $output $_; # print ID for this sequence + my $seq = ""; + # read in sequence lines for this sequence + while (<$fasta>) { # read sequence lines + if (/^>/) {last} # start of next sequence + chop; + $seq .= $_; + } + # print sequence in lines of length $line_size + # get portion of sequence to print if -off and/or -len given + if ($off != 1 || $len != -1) { + if ($len == -1) { + $seq = substr($seq, $off-1); + } else { + $seq = substr($seq, $off-1, $len); + } + } + for ($i=0; $i $@ +ama-qvalues: ama-qvalues.txt Makefile + $(SED) $(SEDSPEC) $< > $@ cat_max: cat_max.txt Makefile $(SED) $(SEDSPEC) $< > $@ chen2meme: chen2meme.pl Makefile @@ -134,6 +140,8 @@ $(SED) $(SEDSPEC) $< > $@ fasta-shuffle-letters: fasta-shuffle-letters.txt Makefile $(SED) $(SEDSPEC) $< > $@ +fasta-subsample: fasta-subsample.txt Makefile + $(SED) $(SEDSPEC) $< > $@ fasta-unique-names: fasta-unique-names.txt Makefile $(SED) $(SEDSPEC) $< > $@ get_db_csv: get_db_csv.txt Makefile Index: doc/release-notes.html =================================================================== --- doc/release-notes.html (revision 4519) +++ doc/release-notes.html (working copy) @@ -28,6 +28,29 @@

MEME Suite System Release Notes


+ MEME version NEXT +

+
    +
  • + fasta-subsample --
    + New script for extracting a random sampling of the sequences + in a FASTA file. This is especially useful for ChIP-seq + peak datasets to be input to MEME. Using this script, a + FASTA file containing a subset of the ChIP-seq peak sequences + (or any other FASTA file) can be created. The total + number of sequences should be less than 1000 (peferably less + than 500), and the total sequence length should be less + than a few hundred thousand. MEME typically takes about 20 minutes + per motif with files of 100,000 characters (DNA, both strands, ZOOPS model), + and scales quadratically in the total sequence length (so a file of + 200,000 characters will take four times as long.) + This new script can also output the remaining sequences (in a sepqrate + file) for use in cross-validation. +
  • +
+ +
+

MEME version 4.4.0 -- April 23, 2010

    Index: doc/mast.html =================================================================== --- doc/mast.html (revision 4529) +++ doc/mast.html (working copy) @@ -87,6 +87,7 @@ [-minseqs <ms>]lower bound on number of sequences in db [-nostatus]do not print progress report + [-notext]do not create text output [-nohtml]do not create html output

    @@ -113,7 +114,8 @@ for human viewing. The text format is available for backwards compatibility though due to design decisions made to optimise the xml for html generation the output for separate scoring mode is not identical and some options were - removed. + removed. The text format will be unsupported in future releases and so we + recommend you migrate any programs reading mast output to the xml format.

    MAST outputs three things: Index: doc/mast2txt.html =================================================================== --- doc/mast2txt.html (revision 4529) +++ doc/mast2txt.html (working copy) @@ -16,7 +16,8 @@

    Usage: mast2txt <mast xml file> <output text file>

    Description:

    -Convert a mast xml file into a mast text file. +Provides backwards compatibility by converting a mast xml file into a mast text file. +Warning: Mast text format will not be supported in the next release.

    Input:

    <mast xml file> - An xml file created by the mast program.

    Index: src/mast2txt.c =================================================================== --- src/mast2txt.c (revision 4529) +++ src/mast2txt.c (working copy) @@ -15,7 +15,7 @@ #include "utils.h" /*debugging macros {{{*/ -VERBOSE_T verbosity = HIGH_VERBOSE; +VERBOSE_T verbosity = QUIET_VERBOSE; BOOLEAN_T status = TRUE; time_t status_last = 0; //last time a status message was output time_t status_delay = 5; //minimum time between status messages @@ -2234,7 +2234,7 @@ int main(int argc, char **argv) { - + int result; //Using SAX scan through the xml document //load the model parameters, //load the alphabet @@ -2244,11 +2244,12 @@ //keep a ordered tree of scores which haven't been inserted yet and write out the data to file //and keep a pointer to it, if (argc == 3) { - parse_xml_file(argv[1], argv[2]); + result = parse_xml_file(argv[1], argv[2]); } else { fprintf(stdout, "mast2txt \n"); + result = 1; } - return 0; + return result; } /*}}}*/ Index: src/mast.c =================================================================== --- src/mast.c (revision 4529) +++ src/mast.c (working copy) @@ -98,6 +98,8 @@ static const char *XML_FILENAME = "mast.xml"; static const char *HTML_STYLESHEET = "mast-to-html.xsl"; static const char *HTML_FILENAME = "mast.html"; +static const char *TXT_FILENAME = "mast.txt"; +static const char *MAST2TXT_FILENAME = "mast2txt"; /* MAST DTD */ /*{{{*/ @@ -1692,6 +1694,7 @@ BOOLEAN lump = FALSE; /* combine spacings into one p-value */ BOOLEAN status = TRUE; /* show progress */ BOOLEAN html = TRUE; /* generate html output */ + BOOLEAN mast2txt = TRUE; /* run mast2txt on the xml output */ STYPE stype = Combine; /* handling of DNA strands */ BOOLEAN translate_dna = FALSE; /* don't translate DNA sequences */ BOOLEAN best_motifs = FALSE; /* only print best motif in diagrams */ @@ -1803,6 +1806,7 @@ DATA_OPTN(1, minseqs, , \tlower bound on number of sequences in db, min_seqs = atoi(_OPTION_)); FLAG_OPTN(1, nostatus, \tdo not print progress report, status = FALSE); + FLAG_OPTN(1, notext, \tdo not generate text output, mast2txt = FALSE); FLAG_OPTN(1, nohtml, \tdo not generate html output, html = FALSE); // DEBUG AND EXPERIMENTAL OPTIONS FLAG_OPTN(EXP, shuffle, \tshuffle columns of motifs, shuffle = TRUE); @@ -2004,6 +2008,27 @@ // finish xml output if (mast_out != stdout) fclose(mast_out); + // finish status report + if (status) fprintf(stderr, "\n"); + + //cleanup + for (i = 0; i < arraylst_size(sequences); ++i) { + SSEQ_T *sseq = (SSEQ_T*)arraylst_get(i, sequences); + sseq_destroy(sseq, nmotifs); + } + arraylst_destroy(sequences); + for (i = 0; i < arraylst_size(databases); ++i) { + DATABASE_T *db = (DATABASE_T*)arraylst_get(i, databases); + myfree(db->source); + myfree(db->name); + myfree(db->link); + if (db->save) fclose(db->save); + if (db->file != stdin) fclose(db->file); + myfree(db); + } + arraylst_destroy(databases); + + //output alternate formats if (xml_path && html) { char *stylesheet_path, *html_path; stylesheet_path = make_path_to_file(ETC_DIR, HTML_STYLESHEET); @@ -2022,28 +2047,30 @@ myfree(stylesheet_path); myfree(html_path); } + if (xml_path && mast2txt) { + char *mast2txt_path, *txt_path; + mast2txt_path = make_path_to_file(BIN_DIR, MAST2TXT_FILENAME); + txt_path = make_path_to_file(out_dir, TXT_FILENAME); + if (file_exists(mast2txt_path)) { + char *cmd; + int len; + len = strlen(mast2txt_path) + strlen(xml_path) + strlen(txt_path) + 9; + cmd = mm_malloc(sizeof(char) * len); + snprintf(cmd, len, "\"%s\" \"%s\" \"%s\"", mast2txt_path, xml_path, txt_path); + if (system(cmd)) { + fprintf(stderr, "Warning: program mast2txt failed to run.\n"); + } + myfree(cmd); + } else { + if (verbosity >= NORMAL_VERBOSE) + fprintf(stderr, "Warning: could not find the program \"%s\" required for transformation of xml to txt. Have you installed %s correctly?\n", + mast2txt_path, program_name); + } + myfree(mast2txt_path); + myfree(txt_path); + } - // finish status report - if (status) fprintf(stderr, "\n"); - - //cleanup - for (i = 0; i < arraylst_size(sequences); ++i) { - SSEQ_T *sseq = (SSEQ_T*)arraylst_get(i, sequences); - sseq_destroy(sseq, nmotifs); - } - arraylst_destroy(sequences); - for (i = 0; i < arraylst_size(databases); ++i) { - DATABASE_T *db = (DATABASE_T*)arraylst_get(i, databases); - myfree(db->source); - myfree(db->name); - myfree(db->link); - if (db->save) fclose(db->save); - if (db->file != stdin) fclose(db->file); - myfree(db); - } - arraylst_destroy(databases); myfree(xml_path); - return 0; } /* main */ Index: etc/mast.sh.in =================================================================== --- etc/mast.sh.in (revision 4529) +++ etc/mast.sh.in (working copy) @@ -86,6 +86,7 @@ echo "

    Results


    " >> index.html Index: etc/mast-to-html.xsl =================================================================== --- etc/mast-to-html.xsl (revision 4529) +++ etc/mast-to-html.xsl (working copy) @@ -126,6 +126,9 @@ calculate_wrap(); if (previous_wrap != wrap) { for (var seqid in loadedSequences) { + //exclude inherited properties and undefined properties + if (!loadedSequences.hasOwnProperty(seqid) || loadedSequences[seqid] === undefined) continue; + var sequence = loadedSequences[seqid]; var annobox = document.getElementById(seqid + "_annotation"); var leftPos = parseInt(document.getElementById(seqid + "_dnl").firstChild.firstChild.nodeValue); @@ -871,6 +874,9 @@ var mysegs = document.getElementById(seqid + "_segs"); var lines = mysegs.value.split(/\n/); for (var i in lines) { + //exclude inherited properties and undefined properties + if (!lines.hasOwnProperty(i) || lines[i] === undefined) continue; + var line = lines[i]; var chunks = line.split(/\t/); if (chunks.length != 2) continue; @@ -881,6 +887,9 @@ var myhits = document.getElementById(seqid + "_hits"); lines = myhits.value.split(/\n/); for (var i in lines) { + //exclude inherited properties and undefined properties + if (!lines.hasOwnProperty(i) || lines[i] === undefined) continue; + var line = lines[i]; var chunks = line.split(/\t/); if (chunks.length != 6) continue; Index: etc/meme-mast.sh.in =================================================================== --- etc/meme-mast.sh.in (revision 4529) +++ etc/meme-mast.sh.in (working copy) @@ -89,6 +89,7 @@ if (-s meme.xsl) echo "
  • XSLT Stylsheet for converting MEME XML to HTML.
  • " >> index.html if (-s mast.html) echo "
  • MAST output as HTML
  • " >> index.html if (-s mast.xml) echo "
  • MAST output as XML
  • " >> index.html + if (-s mast.txt) echo "
  • MAST output as txt (format being phased out in next release)
  • " >> index.html if (-s sequences) echo "
  • input sequences
  • " >> index.html if (-s uploaded_bfile) echo "
  • background Markov model
  • " >> index.html if (-s negfile) echo "
  • Negative sequences
  • " >> index.html Index: etc/logo.js =================================================================== --- etc/logo.js (revision 4529) +++ etc/logo.js (working copy) @@ -227,6 +227,9 @@ var line_num = 0; var col_num = 0; for (line_index in lines) { + //exclude inherited properties and undefined properties + if (!lines.hasOwnProperty(line_index) || lines[line_index] === undefined) continue; + var line = lines[line_index]; if (is_empty.test(line)) { continue; @@ -248,6 +251,9 @@ col_num = 0; var parts = line.split(/\s+/); for (part_index in parts) { + //exclude inherited properties and undefined properties + if (!parts.hasOwnProperty(part_index) || parts[part_index] === undefined) continue; + var prob = parts[part_index]; if (!is_prob.test(prob)) continue; if (col_num >= this.alph_length) { @@ -368,6 +374,9 @@ function Pspm_to_string() { var str = ""; for (row_index in this.pspm) { + //exclude inherited properties and undefined properties + if (!this.pspm.hasOwnProperty(row_index) || this.pspm[row_index] === undefined) continue; + var row = this.pspm[row_index]; str += row.join("\t") + "\n"; } @@ -618,6 +627,9 @@ var lpad = 2; var pos = metrics.stack_height; for (var i in symbols) { + //exclude inherited properties and undefined properties + if (!symbols.hasOwnProperty(i) || symbols[i] === undefined) continue; + var sym = symbols[i]; var sym_height = metrics.stack_height*sym.get_scale(); var letter = metrics.get_letter_metrics(sym.get_symbol());