#!/bin/csh
# AUTHOR: Timothy L. Bailey
# CREATE DATE: July 4, 2025
# | perl -ne 'print STDERR $_;'

# Name of this script without its directory; used in the usage message
# and as the prefix of all temporary file names.
set pgm = $0; set pgm = $pgm:t

# Usage message.
# The condition below is never true (an argument count cannot be negative),
# so this block is skipped during normal execution.  It is only entered by
# a "goto usage" from the argument parsing code.  It pages the usage text,
# removes the placeholder temporary file if it was already created, and
# exits with status 1.
# NOTE: csh scans the skipped block word by word looking for "endif", so
# no line of the usage text may start with a csh keyword (if, else, endif).
if ($#argv < 0) then
  usage:
  more << USAGE
  USAGE:
        $pgm [options]

	Convert a TSV file that describes, in simple format,
	an NCBI sequence database, into a CSV file in the
	update-sequence-db format.

	The input format is TSV with the following columns:
		# GCF	Taxon	Strain_or_Isolate	Common_Name	FTP

	An example input file is given in etc/db_ncbi_genomes.tsv.
	To convert it to CSV format for use with update-sequence-db do:
		$pgm < db_ncbi_genomes.tsv > db_ncbi_genomes.csv

	Options:
		-h	print this message

	Reads from standard input.
	Writes to standard output.

USAGE
  if ($?dummy) then
    rm $dummy
  endif
  exit 1
endif

# Create a placeholder temporary file so that the glob "$pgm.$$.*.tmp"
# used at the cleanup label always matches at least one file; csh aborts
# a command with "No match" when a glob matches nothing.
set dummy = $pgm.$$.dummy.tmp
touch $dummy

# Remove any CPU time limit, and make an interrupt jump to the cleanup
# label so that the temporary files are removed.
unlimit cputime
onintr cleanup

# Get input arguments.
# No options other than -h are supported, so every argument ends up
# printing the usage message.  The switch operand is quoted so that an
# argument containing white space or glob characters reaches the usage
# message instead of aborting the script with a csh error.
while ("$1" != "")
  switch ("$1")
  case -h:
    goto usage
  default:
    goto usage
  endsw
  shift
end

# Temporary file that holds the gawk program; it is removed at the
# cleanup label together with the other $pgm.$$.*.tmp files.
set gawk = $pgm.$$.gawk.tmp

# Write the gawk program.  The here-document word is quoted so that csh
# performs no substitution on the text ($0, $1, ... are gawk fields);
# csh requires the terminating line to include the quotes as well.
cat << "END" > $gawk
  # Convert one tab-separated input record
  #   GCF  Taxon  Strain_or_Isolate  Common_Name  FTP
  # into one update-sequence-db CSV record followed by an empty line.
  BEGIN {
    ftp = "https://ftp.ncbi.nlm.nih.gov";
    ncbi_url = "https://www.ncbi.nlm.nih.gov/datasets/genome/";
  }
  # Skip comment lines (such as the column header) and blank lines.
  # A blank line would otherwise yield a record with empty names and
  # malformed URLs.
  /^#/ || /^[[:space:]]*$/ { next }
  {
    gcf = $1;
    taxon = $2;
    strain = $3;
    common_name = $4;
    # The common name is optional; when present it is shown in parentheses.
    if (common_name != "") { common_name = " (" common_name ")"; }
    dir = $5;
    # The download file names are prefixed with the 8th "/"-separated piece
    # of the FTP path.
    # NOTE(review): this presumably expects paths of the form
    # /genomes/all/GCF/nnn/nnn/nnn/<assembly>, where the leading "/" makes
    # the first piece empty -- confirm against etc/db_ncbi_genomes.tsv.
    split(dir, fields, "/");
    file = fields[8];
    # Database root name: lower-cased taxon plus accession, blanks as "_".
    db_root = tolower(taxon) "_" gcf;
    gsub(" ", "_", db_root);
    prot = "yes";
    nuc = "yes";
    short_seqs = "yes";
    db_menu_name = taxon " " strain common_name;
    db_long_name = "Genome and proteins for <i>" db_menu_name "</i> from <a href=" ncbi_url gcf">NCBI</a>";
    protein_url = ftp dir "/" file "_protein.faa.gz";
    nuc_url = ftp dir "/" file "_genomic.fna.gz";
    printf("%s,%s,%s,%s,%s,%s,%s,%s\n", db_root, prot, nuc, short_seqs, db_menu_name, db_long_name, protein_url, nuc_url);
    printf("\n");
  }
"END"

# Output the header for update-sequence-db: a comment line naming the
# CSV columns, a menu separator entry, and an empty line.
echo "# <db_root>,<prot?>,<nuc?>,<short_seqs?>,<db_menu_name>,<db_long_name>,<protein_url>,<nuc_url>"
echo ",,,,----NCBI Genomes----,,,"
echo ""

# Sort entries by Taxon name (the key starts at column 2) and convert
# them to CSV format.  Input comes from standard input.
set tab = "	"	# tcsh is funny about tab with sort command
sort -t "$tab" -k2 | \
  gawk -F '\t' -f $gawk
# Remember whether the pipeline succeeded; the "rm" below would otherwise
# overwrite the status and the script would always report success.
set exit_status = $status

# Remove all temporary files.  This label is also the target of the
# interrupt handler, in which case exit_status has not been set and the
# script exits with status 1.
cleanup:
rm $pgm.$$.*.tmp
if (! $?exit_status) set exit_status = 1
exit $exit_status
