Step 0: Bootstrap

# Workstation-side locations: the Effusion checkout, its virtualenv python,
# and the raw-download (archive) vs. processed-data directories for the
# 1704 release.
export EFFUSION_DIR="${HOME}/remote_projects/effusion"
export PYTHON="${EFFUSION_DIR}/venv/bin/python"
export ARCHIVE_DIR="/mnt/fast/effusion/1704.archive"
export DATA_DIR="/mnt/fast/effusion/1704.data"
# export DATA_DIR="/mnt/fast/effusion/1704.check"

# Cluster-side equivalents (paths as seen on the QB3 cluster).
export CLUSTER_PYTHON="/netapp/home/jeff/venv/bin/python"
export CLUSTER_DATA_DIR="/netapp/home/jeff/effusion/1704.data"

Step 1a: Install Effusion on workstation

Step 1b: Install Effusion on cluster

rsync -av --exclude venv --exclude .git --delete /Users/jeff/Workspaces/effusion pass1.compbio.ucsf.edu:software/

Step 2: Configure for each system

cp $EFFUSION_DIR/etc/workstation.cfg $HOME/etc/effusion.cfg

or, if configuring the cluster:

cp $EFFUSION_DIR/etc/qb3_cluster.cfg $HOME/etc/effusion.cfg`

# [edit as needed]
# [confirm sqlite3 config] [todo: do this programmatically]
    # Show the sqlite3 per-user config for manual inspection.  The stray
    # trailing backtick in the original started an unterminated command
    # substitution; it has been removed.
    cat ~/.sqliterc
    # [confirm "PRAGMA page_size = 4096;"]

Step 3: Pre-process data and calculate params on workstation

# Create the archive and data directories; -p makes re-running the runbook
# safe when they already exist (plain mkdir fails on an existing dir).
mkdir -p -- "$ARCHIVE_DIR" "$DATA_DIR"
# Later steps write their outputs relative to $DATA_DIR.
cd "$DATA_DIR"

Sequence data

# Download the UniProt sequence sets (SwissProt + TrEMBL).
# NOTE(review): the goa_uniprot_all.gaf.gz download that used to be here is
# annotation data, not sequence data, and is fetched in the "GO / GOA data"
# section below; downloading it twice made the second `wget -P` save a stray
# goa_uniprot_all.gaf.gz.1 that make_goa_db.py never reads, so it was
# removed from this section.
wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz -P "$ARCHIVE_DIR"
wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz -P "$ARCHIVE_DIR"
# Build the BLAST protein database from the concatenated FASTA stream
# (makeblastdb's -in defaults to "-", i.e. stdin).  Ensure the output
# directory exists first.
mkdir -p uniprot
gunzip -c "$ARCHIVE_DIR/uniprot_sprot.fasta.gz" "$ARCHIVE_DIR/uniprot_trembl.fasta.gz" | ~/Applications/ncbi-blast-2.6.0+/bin/makeblastdb -dbtype prot -parse_seqids -out uniprot/uniprot -title uniprot
# Build the DIAMOND database from the same stream.  gunzip -c replaces the
# original zcat for consistency with the line above and portability (BSD/macOS
# zcat only handles .Z files).
gunzip -c "$ARCHIVE_DIR/uniprot_sprot.fasta.gz" "$ARCHIVE_DIR/uniprot_trembl.fasta.gz" | ~/Applications/diamond makedb -d diamond_uniprot

Functional association data

# STRING v10 functional-association data, reused from an earlier archive on
# the external drive rather than re-downloaded; symlink into this release's
# archive dir.
ln -sf /media/jeff/Bytes/agca/20160804.archive/protein.aliases.v10.txt.gz $ARCHIVE_DIR
ln -sf /media/jeff/Bytes/agca/20160804.archive/protein.links.v10.txt.gz $ARCHIVE_DIR
# Build string.db in the cwd ($DATA_DIR, set by the cd above), keeping only
# links with STRING combined score >= 900 (presumably the high-confidence
# cutoff on STRING's 0-1000 scale — confirm against make_string_db.py).
$PYTHON $EFFUSION_DIR/bin/make_string_db.py --min-string-score 900 $ARCHIVE_DIR/protein.links.v10.txt.gz $ARCHIVE_DIR/protein.aliases.v10.txt.gz string.db

GO / GOA data

# Download the GO ontology (go-basic) and the full UniProt GOA annotations.
wget http://purl.obolibrary.org/obo/go/go-basic.obo -P $ARCHIVE_DIR
# NOTE(review): this same file is also fetched in the "Sequence data" section
# above; running both leaves a duplicate goa_uniprot_all.gaf.gz.1 in
# $ARCHIVE_DIR — only one of the two downloads should be kept.
wget ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/goa_uniprot_all.gaf.gz -P $ARCHIVE_DIR
# The go.obo symlink is created in the cwd ($DATA_DIR, set by the cd above).
ln -sf $ARCHIVE_DIR/go-basic.obo go.obo  # link the ontology to use

# the next few lines are a new attempt to use goslim for testing
# wget http://www.geneontology.org/ontology/subsets/goslim_generic.obo -P $ARCHIVE_DIR # download
# wget http://build.berkeleybop.org/userContent/owltools/owltools  # install owltools for converting full annotations to go-slim annotations. needs java9. java9 ppa down. head of owltools is bash script that shows command to use non-system java
# /usr/local/jdk-9.0.4/bin/java -Xms2G -Xmx6G -DentityExpansionLimit=4086000 -Djava.awt.headless=true -classpath ./owltools owltools.cli.CommandLineInterface go-basic.obo --gaf annotations.gaf --map2slim --subset goslim_generic --write-gaf annotations.mapped.gaf
# ln -sf $DATA_DIR_MINI/goslim_generic.obo go.obo  # link the ontology to use

Phase dependent data & params

# Record the phase-dependent parameters: the GO id treated as the catalytic
# root and the annotation cutoff dates for the training/validation/test
# splits.  Written to data.json in the cwd ($DATA_DIR, set by the cd above).
echo '{"catalytic_id": "GO:0003824", "cutoff_dates": {"training": "2014-12-31", "validation": "2015-12-31", "test": "2016-12-31"}}' > data.json
# Small-sample variant for debugging:
#$PYTHON $EFFUSION_DIR/bin/make_goa_db.py --limit 10000 $DATA_DIR $ARCHIVE_DIR/goa_uniprot_all.gaf.gz $DATA_DIR/goa-sample.db
"$PYTHON" "$EFFUSION_DIR/bin/make_goa_db.py" "$DATA_DIR" "$ARCHIVE_DIR/goa_uniprot_all.gaf.gz" "$DATA_DIR/goa.db"
# Mark the built databases read-only.  The chmod targets now use explicit
# $DATA_DIR paths: the original used bare relative filenames, which silently
# depended on the cwd while the adjacent commands used absolute paths.
chmod ugo-w "$DATA_DIR/goa.db"
"$PYTHON" "$EFFUSION_DIR/bin/make_goa_blast_db.py" "$DATA_DIR" "$DATA_DIR"
"$PYTHON" "$EFFUSION_DIR/bin/make_protein_network_parents_db.py" "$DATA_DIR" "$DATA_DIR/protein_network_parents.db"
chmod ugo-w "$DATA_DIR/protein_network_parents.db"
"$PYTHON" "$EFFUSION_DIR/bin/choose_evaluation_proteins.py" --mf_only --ida_only "$DATA_DIR" 1000000 "$DATA_DIR"

Copy to cluster

# Preview the transfer first: the active line is a --dry-run; the real copy
# is the commented line below.  preds*/networks* are excluded — presumably
# intermediate outputs not needed on the cluster (confirm).
rsync -av --dry-run --exclude "preds*" --exclude "networks*" $DATA_DIR/ pass1.compbio.ucsf.edu:$CLUSTER_DATA_DIR
# rsync -av --exclude "preds*" --exclude "networks*" $DATA_DIR/ pass1.compbio.ucsf.edu:$CLUSTER_DATA_DIR