From 42885df2252970ae147eb88be536490b1d890e21 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 27 May 2024 13:41:56 +0200 Subject: [PATCH 01/19] Added RagTag param config --- Snakefile | 4 +++- config.yaml | 3 +++ scripts/ragtagChromInfer.sh | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 81cf643..18daaa9 100644 --- a/Snakefile +++ b/Snakefile @@ -101,7 +101,8 @@ rule ragtag_scaffolding: threads: 8 retries: 1 params: - apppath=config["app.path"] + apppath=config["app.path"], + rtcmd=config["rt.cmd"] shell: """ bash scripts/ragtagChromInfer.sh \ @@ -110,6 +111,7 @@ rule ragtag_scaffolding: -t {threads} \ -r {input.ref} \ -q {input.fa} \ + -c {params.rtcmd} \ -o {output} if [ ! -s {output} ]; then diff --git a/config.yaml b/config.yaml index 7780f37..917fc2c 100644 --- a/config.yaml +++ b/config.yaml @@ -7,6 +7,9 @@ reference: 'CHM13.hap1.fa.gz' app.path: '<path>' # Core parameters (WIP) +# RagTag parameters +rt.cmd: '-s 1' + # Wfmash alignement parameters : leave to None if unknown wfmash.segment_length: 5000 wfmash.mapping_id: 90 diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh index c68f0d1..fe8383f 100755 --- a/scripts/ragtagChromInfer.sh +++ b/scripts/ragtagChromInfer.sh @@ -10,13 +10,14 @@ inputquery="" # Query fasta output="" # Output fasta ## Getting arguments -while getopts "d:a:t:r:q:o:" option; do +while getopts "d:a:t:r:q:c:o:" option; do case "$option" in d) tmpdir="$OPTARG";; a) appdir="$OPTARG";; t) threads="$OPTARG";; r) inputref="$OPTARG";; q) inputquery="$OPTARG";; + c) rtcommand="$OPTARG";; o) output="$OPTARG";; \?) echo "Usage: $0 [-d tmpdir] [-a apptainer dir] [-t threads] [-r inputref] [-q inputquery] [-o output fasta] [-n pangenome name]" >&2 exit 1;; @@ -33,6 +34,7 @@ mkdir -p $tmpdir # Running ragtag scaffold apptainer run $appdir/pan1c-env.sif ragtag.py scaffold \ + $rtcommand \ -t $threads -o $tmpdir $inputref $inputquery # Renaming sequence according to naming scheme -- GitLab From 46c330885ba2f08094668eb503ef25e1a0fe84a8 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 27 May 2024 14:18:02 +0200 Subject: [PATCH 02/19] Update ragtagChromInfer.sh --- scripts/ragtagChromInfer.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh index fe8383f..c52476d 100755 --- a/scripts/ragtagChromInfer.sh +++ b/scripts/ragtagChromInfer.sh @@ -34,8 +34,7 @@ mkdir -p $tmpdir # Running ragtag scaffold apptainer run $appdir/pan1c-env.sif ragtag.py scaffold \ - $rtcommand \ - -t $threads -o $tmpdir $inputref $inputquery + "$rtcommand" -t $threads -o $tmpdir $inputref $inputquery # Renaming sequence according to naming scheme grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \ -- GitLab From 4522bd9b43b38b0abc3895b94b2de573cf853fc9 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 27 May 2024 14:22:23 +0200 Subject: [PATCH 03/19] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 18daaa9..14ce9eb 100644 --- a/Snakefile +++ b/Snakefile @@ -111,7 +111,7 @@ rule ragtag_scaffolding: -t {threads} \ -r {input.ref} \ -q {input.fa} \ - -c {params.rtcmd} \ + -c "{params.rtcmd}" \ -o {output} if [ ! -s {output} ]; then -- GitLab From aaefc32389ea50c9febba0bf57f13b92b6829ef9 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 4 Jun 2024 10:20:44 +0200 Subject: [PATCH 04/19] Update config.yaml --- config.yaml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/config.yaml b/config.yaml index 917fc2c..4ddca16 100644 --- a/config.yaml +++ b/config.yaml @@ -6,11 +6,8 @@ reference: 'CHM13.hap1.fa.gz' # Directory of apptainer images (downloaded with getApps.sh) app.path: '<path>' -# Core parameters (WIP) -# RagTag parameters -rt.cmd: '-s 1' - -# Wfmash alignement parameters : leave to None if unknown +# Core parameters +# Wfmash alignement parameters : wfmash.segment_length: 5000 wfmash.mapping_id: 90 wfmash.secondary: '-k 19 -H 0.001 -X' @@ -31,6 +28,6 @@ get_PAV: 'False' get_allASM_SyRI: 'False' # All vs all get_ASMs_SyRI: 'False' # Haplotype vs Reference -# Debug options +# Debug options (Useless at the moment) debug: 'False' -- GitLab From 0f755fc75d0dd6732b83b115b53e0bb2cc208320 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 4 Jun 2024 10:21:03 +0200 Subject: [PATCH 05/19] Moved app path to variable for simplicity --- runSnakemake.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/runSnakemake.sh b/runSnakemake.sh index 4206dc0..37b8d03 100755 --- a/runSnakemake.sh +++ b/runSnakemake.sh @@ -9,8 +9,10 @@ module purge module load containers/Apptainer/1.2.5 +apppath=<path_to_pan1c-box> + # Creating DAG -apptainer run <path_to_pan1c-box>/pan1c-box.sif snakemake -c $(nproc) --dag | dot -Tsvg > workflow.svg +apptainer run $apppath/pan1c-box.sif snakemake -c $(nproc) --dag | dot -Tsvg > workflow.svg # Running the workflow -/usr/bin/time -v -o whole.run.time.log apptainer run <path_to_pan1c-box>/pan1c-box.sif snakemake -c $(nproc) +/usr/bin/time -v -o whole.run.time.log apptainer run $apppath/pan1c-box.sif snakemake -c $(nproc) -- GitLab From a445f3678951ffb27f3561b3b579f593aef1a7ef Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 4 Jun 2024 10:22:26 +0200 Subject: [PATCH 06/19] Changed some output names Added pangenome name to pggb inputs, chromosome graphs, ... --- Snakefile | 64 +++++++++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/Snakefile b/Snakefile index 14ce9eb..bed2046 100644 --- a/Snakefile +++ b/Snakefile @@ -36,10 +36,10 @@ with gzip.open("data/haplotypes/"+config['reference'], "r") as handle: def which_analysis(): # Creating a list with default analysis steps (to prevent the function from returning an empty list) analysis_inputs = [ - "output/stats/pan1c.pggb."+config['name']+".core.stats.tsv", # core stats + "output/stats/pan1c."+config['name']+".core.stats.tsv", # core stats expand("output/panacus.reports/{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth expand("output/chrGraphs.figs/{chromosome}.1Dviz.png", chromosome=CHRLIST), # visualizations from odgi on chromosome graphs - "output/stats/pan1c.pggb."+config['name']+".chrGraph.general.stats.tsv" # chromosomes graph statistics + "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" # chromosomes graph statistics ] # Optionals analysis steps @@ -61,8 +61,8 @@ Rules # Main target rule rule all: input: - "output/pan1c.pggb."+config['name']+".gfa", # Final graph (main output) - "output/pan1c.pggb."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line) + "output/pan1c."+config['name']+".gfa", # Final graph (main output) + "output/pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line) which_analysis() """ @@ -101,8 +101,7 @@ rule ragtag_scaffolding: threads: 8 retries: 1 params: - apppath=config["app.path"], - rtcmd=config["rt.cmd"] + apppath=config["app.path"] shell: """ bash scripts/ragtagChromInfer.sh \ @@ -111,7 +110,6 @@ rule ragtag_scaffolding: -t {threads} \ -r {input.ref} \ -q {input.fa} \ - -c "{params.rtcmd}" \ -o {output} if [ ! -s {output} ]; then @@ -125,7 +123,7 @@ rule clustering: input: expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) output: - expand('data/chrInputs/{chromosome}.fa.gz', chromosome=CHRLIST) + expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', chromosome=CHRLIST) threads: workflow.cores params: apppath=config["app.path"] @@ -196,7 +194,7 @@ Core section : Running PGGB """ rule create_pggb_input_syri_fig: input: - fasta='data/chrInputs/{chromosome}.fa.gz' + fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' output: fig="output/chrInput.syri.figs/{chromosome}."+config['name']+".asm.syri.png", wrkdir=directory('data/chrInput/syri/{chromosome}') @@ -235,10 +233,10 @@ rule create_pggb_input_syri_fig: # rule pggb_on_chr: # # Run pggb on a specific chromosome # input: -# fa="data/chrInputs/{chromosome}.fa.gz", +# fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', # gzi="data/chrInputs/{chromosome}.fa.gz.gzi" # output: -# gfa="data/chrGraphs/{chromosome}.gfa" +# gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' # threads: 16 # params: # pggb=config['pggb.params'], @@ -260,8 +258,8 @@ rule create_pggb_input_syri_fig: rule wfmash_on_chr: # Run wfmash on a specific chromosome input input: - fa="data/chrInputs/{chromosome}.fa.gz", - fai="data/chrInputs/{chromosome}.fa.gz.fai" + fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', + fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai' output: mapping="data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf", aln="data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf" @@ -303,7 +301,7 @@ rule wfmash_on_chr: rule seqwish: # Run seqwish on alignement produced by wfmash input: - fa="data/chrInputs/{chromosome}.fa.gz", + fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', aln=rules.wfmash_on_chr.output.aln output: "data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa" @@ -349,7 +347,7 @@ rule odgi_postprocessing: input: rules.gfaffix_on_chr.output.gfa output: - gfa="data/chrGraphs/{chromosome}.gfa" + gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' threads: 8 params: apppath=config['app.path'] @@ -398,7 +396,7 @@ rule odgi_postprocessing: rule generate_graph_list: # Generate a text file containing all created graphs input: - expand("data/chrGraphs/{chromosome}.gfa", chromosome=CHRLIST) + expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) output: "data/chrGraphs/graphsList.txt" threads: 1 @@ -412,7 +410,7 @@ rule graph_squeeze: input: "data/chrGraphs/graphsList.txt" output: - "output/pan1c.pggb."+config['name']+".gfa" + "output/pan1c."+config['name']+".gfa" threads: 16 params: apppath=config['app.path'] @@ -428,10 +426,10 @@ rule graph_squeeze: rule graph_stats: # Using GFAstats to produce stats on every chromosome graphs input: - graph="data/chrGraphs/{chromosome}.gfa" + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' output: - genstats="output/stats/chrGraphs/{chromosome}.general.stats.tsv", - pathstats="output/stats/chrGraphs/{chromosome}.path.stats.tsv" + genstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", + pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv" threads: 4 params: apppath=config['app.path'] @@ -444,7 +442,7 @@ rule graph_stats: rule graph_figs: # Creating figures using odgi viz input: - graph="data/chrGraphs/{chromosome}.gfa" + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' output: oneDviz="output/chrGraphs.figs/{chromosome}.1Dviz.png", pcov="output/chrGraphs.figs/{chromosome}.pcov.png" @@ -467,10 +465,10 @@ rule graph_figs: rule aggregate_graphs_stats: # Reading and merging all stats files from chromosome graphs into a .tsv. input: - genstats=expand("output/stats/chrGraphs/{chromosome}.general.stats.tsv", chromosome=CHRLIST) + genstats=expand("output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST) output: - genstats="output/stats/pan1c.pggb."+config['name']+".chrGraph.general.stats.tsv", - pathstats="output/stats/pan1c.pggb."+config['name']+".chrGraph.path.stats.tsv" + genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", + pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" params: apppath=config['app.path'], panname=config['name'] @@ -484,9 +482,9 @@ rule aggregate_graphs_stats: rule final_graph_tagging: # Add metadata to the final GFA input: - graph="output/pan1c.pggb."+config['name']+".gfa", + graph="output/pan1c."+config['name']+".gfa", output: - "output/pan1c.pggb."+config['name']+".gfa.metadata" + "output/pan1c."+config['name']+".gfa.metadata" threads: 1 params: apppath=config['app.path'], @@ -501,9 +499,9 @@ rule final_graph_tagging: rule pggb_input_stats: # Produces statistics on pggb input sequences input: - flag="output/stats/pan1c.pggb."+config['name']+".chrGraph.general.stats.tsv" + flag="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" output: - "output/stats/pan1c.pggb."+config['name']+".chrInput.stats.tsv" + "output/stats/pan1c."+config['name']+".chrInput.stats.tsv" params: apppath=config['app.path'], panname=config['name'] @@ -516,10 +514,10 @@ rule pggb_input_stats: rule core_statistics: # Aggregate chrInput, chrGraph and pggb statistics into a single tsv input: - chrInputStats="output/stats/pan1c.pggb."+config['name']+".chrInput.stats.tsv", - chrGraphStats="output/stats/pan1c.pggb."+config['name']+".chrGraph.general.stats.tsv" + chrInputStats="output/stats/pan1c."+config['name']+".chrInput.stats.tsv", + chrGraphStats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" output: - tsv="output/stats/pan1c.pggb."+config['name']+".core.stats.tsv", + tsv="output/stats/pan1c."+config['name']+".core.stats.tsv", dir=directory("output/pggb.usage.figs") params: apppath=config['app.path'], @@ -561,9 +559,9 @@ rule get_pav: rule panacus_stats: # Produces panacus reports for a chromosome graph input: - graph="data/chrGraphs/{chromosome}.gfa" + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' output: - html="output/panacus.reports/{chromosome}.histgrowth.html" + html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html' params: apppath=config['app.path'], panname=config['name'], -- GitLab From fe9fe10ea5fe3c189b63ba1aedbbe9ec81b7b0a5 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 4 Jun 2024 10:27:07 +0200 Subject: [PATCH 07/19] Update Snakefile --- Snakefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Snakefile b/Snakefile index bed2046..9b2d349 100644 --- a/Snakefile +++ b/Snakefile @@ -37,8 +37,8 @@ def which_analysis(): # Creating a list with default analysis steps (to prevent the function from returning an empty list) analysis_inputs = [ "output/stats/pan1c."+config['name']+".core.stats.tsv", # core stats - expand("output/panacus.reports/{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth - expand("output/chrGraphs.figs/{chromosome}.1Dviz.png", chromosome=CHRLIST), # visualizations from odgi on chromosome graphs + expand("output/panacus.reports/"+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth + expand("output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png", chromosome=CHRLIST), # visualizations from odgi on chromosome graphs "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" # chromosomes graph statistics ] @@ -196,7 +196,7 @@ rule create_pggb_input_syri_fig: input: fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' output: - fig="output/chrInput.syri.figs/{chromosome}."+config['name']+".asm.syri.png", + fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", wrkdir=directory('data/chrInput/syri/{chromosome}') threads: 8 params: @@ -444,8 +444,8 @@ rule graph_figs: input: graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' output: - oneDviz="output/chrGraphs.figs/{chromosome}.1Dviz.png", - pcov="output/chrGraphs.figs/{chromosome}.pcov.png" + oneDviz="output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png", + pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png" threads: 4 params: apppath=config['app.path'], -- GitLab From 706e8d6e9433bc4c71ccf8c19d6e4851659d96ef Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 4 Jun 2024 18:07:41 +0200 Subject: [PATCH 08/19] Update Snakefile Compressing temporary files in order to save space --- Snakefile | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/Snakefile b/Snakefile index 9b2d349..9c61c97 100644 --- a/Snakefile +++ b/Snakefile @@ -261,8 +261,8 @@ rule wfmash_on_chr: fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai' output: - mapping="data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf", - aln="data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf" + mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"), + aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf") threads: 16 params: apppath=config['app.path'], @@ -296,15 +296,22 @@ rule wfmash_on_chr: --invert-filtering \ 1> {output.aln} \ 2> >(tee {log.cmd_aln} >&2) + + # Compressing + apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + -@ {threads} -k {output.mapping} + + apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + -@ {threads} -k {output.aln} """ - + rule seqwish: # Run seqwish on alignement produced by wfmash input: fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', aln=rules.wfmash_on_chr.output.aln output: - "data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa" + temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa") threads: 8 params: apppath=config['app.path'], @@ -320,6 +327,10 @@ rule seqwish: {params.seqwish} -t {threads} \ --temp-dir $(dirname {output}) -P 2>&1 | \ tee {log.cmd} + + # Compressing + apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + -@ {threads} -k {output} """ rule gfaffix_on_chr: @@ -327,7 +338,7 @@ rule gfaffix_on_chr: input: rules.seqwish.output output: - gfa="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa", + gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"), transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt" threads: 1 params: @@ -340,6 +351,10 @@ rule gfaffix_on_chr: apptainer exec {params.apppath}/pggb.sif gfaffix \ {input} -o {output.gfa} -t {output.transform} \ > /dev/null + + # Compressing + apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + -@ {threads} -k {output.gfa} """ rule odgi_postprocessing: @@ -347,7 +362,7 @@ rule odgi_postprocessing: input: rules.gfaffix_on_chr.output.gfa output: - gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' + gfa=temp('data/chrGraphs/'+config['name']+'.{chromosome}.gfa') threads: 8 params: apppath=config['app.path'] @@ -389,6 +404,13 @@ rule odgi_postprocessing: view -i $OGfile.unchoped.sorted.og -g \ 1> {output.gfa} \ 2> >(tee {log.cmd_view} >&2) + + ## Removing .og files for space savings + rm $(dirname {input})/*.og + + # Compressing + apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + -@ {threads} -k {output.gfa} """ ## ---------------------------------------- -- GitLab From 07b86f0786299cfbf0b5953229d5ddd800e31272 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 5 Jun 2024 09:35:20 +0200 Subject: [PATCH 09/19] Update ragtagChromInfer.sh Compressing RagTag temporary folder --- scripts/ragtagChromInfer.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh index c52476d..05e0d57 100755 --- a/scripts/ragtagChromInfer.sh +++ b/scripts/ragtagChromInfer.sh @@ -46,3 +46,8 @@ apptainer run --app bgzip $appdir/PanGeTools.sif \ # Moving fa.gz to output dir mv $tmpdir/${sample}.ragtagged.fa.gz $output + +# Compressing temporary files +tar --remove-files -cf $tmpdir.tar $tmpdir +apptainer run --app bgzip $appdir/PanGeTools.sif \ + -@ $threads $tmpdir.tar -- GitLab From b8ea708b3cdcbdbe5145774a0360560ea4144b74 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 5 Jun 2024 09:35:58 +0200 Subject: [PATCH 10/19] Update Snakefile Adding metadata to chromosome graphs --- Snakefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Snakefile b/Snakefile index 9c61c97..ea108d3 100644 --- a/Snakefile +++ b/Snakefile @@ -408,6 +408,12 @@ rule odgi_postprocessing: ## Removing .og files for space savings rm $(dirname {input})/*.og + ## Adding metadata + python scripts/getTags.py \ + --appdir {params.apppath} --config-file config.yaml \ + > "$(dirname {input})/metadata.txt" + sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa} + # Compressing apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ -@ {threads} -k {output.gfa} -- GitLab From b2f712f58ec1cab4073d1de1ab0994cb1e488808 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 5 Jun 2024 10:07:47 +0200 Subject: [PATCH 11/19] Update ragtagChromInfer.sh --- scripts/ragtagChromInfer.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh index 05e0d57..9afa538 100755 --- a/scripts/ragtagChromInfer.sh +++ b/scripts/ragtagChromInfer.sh @@ -34,7 +34,7 @@ mkdir -p $tmpdir # Running ragtag scaffold apptainer run $appdir/pan1c-env.sif ragtag.py scaffold \ - "$rtcommand" -t $threads -o $tmpdir $inputref $inputquery + -t $threads -o $tmpdir $inputref $inputquery # Renaming sequence according to naming scheme grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \ -- GitLab From cbe527a20fd40401fab36bcc483c44f08646a481 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 5 Jun 2024 10:13:36 +0200 Subject: [PATCH 12/19] Updated inputClustering Added pangenome name to outputed fasta --- Snakefile | 5 +++-- scripts/inputClustering.py | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Snakefile b/Snakefile index ea108d3..fe9b83c 100644 --- a/Snakefile +++ b/Snakefile @@ -126,12 +126,13 @@ rule clustering: expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', chromosome=CHRLIST) threads: workflow.cores params: - apppath=config["app.path"] + apppath=config["app.path"], + panname=config["name"] shell: """ mkdir -p $(dirname {output[0]}) apptainer run {params.apppath}/pan1c-env.sif python scripts/inputClustering.py \ - --fasta {input} --output $(dirname {output[0]}) + --fasta {input} --output $(dirname {output[0]}) --panname {params.panname} for file in $(dirname {output[0]})/*.fa; do apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file done diff --git a/scripts/inputClustering.py b/scripts/inputClustering.py index 6284eb4..49bdb13 100644 --- a/scripts/inputClustering.py +++ b/scripts/inputClustering.py @@ -33,6 +33,12 @@ arg_parser.add_argument( required = True, help = "Output directory" ) +arg_parser.add_argument( + "--panname", + dest = "panname", + required = True, + help = "Pangenome name" + ) arg_parser.add_argument( "--debug", "-d", @@ -88,6 +94,6 @@ if args.debug: print(chrSeq.keys()) # Writing chromosome specific fasta file for chrName in chrSeq.keys(): - with open(os.path.join(args.outdir, f"{chrName}.fa"), "w") as output_handle: + with open(os.path.join(args.outdir, f"{args.panname}.{chrName}.fa"), "w") as output_handle: fasta_out = FastaIO.FastaWriter(output_handle, wrap=None) fasta_out.write_file(chrSeq[chrName]) \ No newline at end of file -- GitLab From 93eb55ed5677e5c9b5abc80b84a1a35c716c04fa Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 5 Jun 2024 10:17:12 +0200 Subject: [PATCH 13/19] Update Snakefile --- Snakefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index fe9b83c..c809065 100644 --- a/Snakefile +++ b/Snakefile @@ -461,11 +461,14 @@ rule graph_stats: pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv" threads: 4 params: - apppath=config['app.path'] + apppath=config['app.path'], + panname=config['name'] shell: """ apptainer run --app gfastats {params.apppath}/PanGeTools.sif \ - -g {input.graph} -P -o $(dirname {output.genstats})/{wildcards.chromosome} -t {threads} + -g {input.graph} -P \ + -o $(dirname {output.genstats})/{params.panname}.{wildcards.chromosome} \ + -t {threads} """ rule graph_figs: -- GitLab From 3f3c2a9f02afba3586f79f185854643f987ffe94 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 5 Jun 2024 10:34:58 +0200 Subject: [PATCH 14/19] Updated Panacus part --- Snakefile | 2 +- scripts/getPanacusHG.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index c809065..c22aa76 100644 --- a/Snakefile +++ b/Snakefile @@ -604,7 +604,7 @@ rule panacus_stats: bash scripts/getPanacusHG.sh \ -g {input.graph} \ -r $(basename {params.refname} .fa.gz) \ - -d data/chrGraphs/$(basename {input.graph} .gfa) \ + -d data/chrGraphs/{wildcards.chromosome} \ -o {output.html} \ -a {params.apppath} \ -t {threads} diff --git a/scripts/getPanacusHG.sh b/scripts/getPanacusHG.sh index 41a2222..7f3ddb9 100755 --- a/scripts/getPanacusHG.sh +++ b/scripts/getPanacusHG.sh @@ -25,7 +25,7 @@ while getopts "g:r:a:t:d:o:" option; do done # Getting chromosome name -chrname=$(basename ${gfa} .gfa) +chrname=$(basename ${gfa} .gfa | cut -d'.' -f2) ref=$(echo $refname | sed 's/.hap/#/') # Getting paths in chromosome graph -- GitLab From 6e5e5af38f8c14ef4a938a04b506da76b9bae34b Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 5 Jun 2024 10:50:26 +0200 Subject: [PATCH 15/19] Update Snakefile --- Snakefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index c22aa76..e0484d3 100644 --- a/Snakefile +++ b/Snakefile @@ -437,7 +437,8 @@ rule generate_graph_list: rule graph_squeeze: # Using odgi to merge every subgraphs into a final one input: - "data/chrGraphs/graphsList.txt" + glist="data/chrGraphs/graphsList.txt", + graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) output: "output/pan1c."+config['name']+".gfa" threads: 16 @@ -446,7 +447,7 @@ rule graph_squeeze: shell: """ apptainer run --app odgi {params.apppath}/PanGeTools.sif \ - squeeze -t {threads} -O -P -f {input} -o {output}.og + squeeze -t {threads} -O -P -f {input.glist} -o {output}.og apptainer run --app odgi {params.apppath}/PanGeTools.sif \ view -t {threads} -P -i {output}.og -g > {output} rm {output}.og -- GitLab From 525c0047d6dc1b7fa340d9c3079323bd99a7d23d Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 5 Jun 2024 10:59:13 +0200 Subject: [PATCH 16/19] Update chrInputStats.py --- scripts/chrInputStats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/chrInputStats.py b/scripts/chrInputStats.py index e232f9f..8b3015b 100644 --- a/scripts/chrInputStats.py +++ b/scripts/chrInputStats.py @@ -59,7 +59,7 @@ seqDict = {} for filename in args.fastafiles: # Getting chromosome name from fasta filename - chrName = os.path.basename(filename).split(".fa.gz")[0] + chrName = os.path.basename(filename).split(".fa.gz")[0].split('.')[-1] # Reading bgzip fasta file and adding records to seqDict with gzip.open(filename, "rt") as handle: -- GitLab From 0e3f1f1d99718e759d35ea334971ce9b9898e7b3 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 5 Jun 2024 11:12:15 +0200 Subject: [PATCH 17/19] Update chrStatsAggregation.py --- scripts/chrStatsAggregation.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/chrStatsAggregation.py b/scripts/chrStatsAggregation.py index af9d122..be2cf95 100644 --- a/scripts/chrStatsAggregation.py +++ b/scripts/chrStatsAggregation.py @@ -54,6 +54,7 @@ for file in fileList: # Getting filename (i.e. chromosome) filename, extension = os.path.splitext(os.path.splitext(os.path.basename(file))[0]) + chrname = filename.split('.')[-1] statType = filename.split('.')[-1] # Getting general or path stats @@ -65,9 +66,16 @@ for file in fileList: if len(stats[statType]) != 0: filelines = filelines[1:] + # Removing chromosome name from each line + filelines = [ + '\t'.join(line.split('\t')[1:]) + for line in filelines + ] + # Adding pangenome name as first column filelines = [ - f"{args.panname}\t{line}" for line in filelines + f"{args.panname}\t{chrname}\t{line}" + for line in filelines ] # Else, rework header to add pangenome name else : -- GitLab From 06521322d8c5365763ea7e5228912e7f38602c9d Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 7 Jun 2024 14:36:57 +0200 Subject: [PATCH 18/19] Update chrStatsAggregation.py --- scripts/chrStatsAggregation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/chrStatsAggregation.py b/scripts/chrStatsAggregation.py index be2cf95..3ac2b10 100644 --- a/scripts/chrStatsAggregation.py +++ b/scripts/chrStatsAggregation.py @@ -54,7 +54,7 @@ for file in fileList: # Getting filename (i.e. chromosome) filename, extension = os.path.splitext(os.path.splitext(os.path.basename(file))[0]) - chrname = filename.split('.')[-1] + chrname = filename.split('.')[-2] statType = filename.split('.')[-1] # Getting general or path stats @@ -85,7 +85,12 @@ for file in fileList: # Adding pangenome name as first column filelines[1:] = [ - f"{args.panname}\t{line}" for line in filelines[1:] + '\t'.join(line.split('\t')[1:]) + for line in filelines[1:] + ] + + filelines[1:] = [ + f"{args.panname}\t{chrname}\t{line}" for line in filelines[1:] ] # Redirecting to the correct list -- GitLab From df61e64a991d0614b4e1951d07438918b36ef5d8 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 11 Jun 2024 18:36:26 +0200 Subject: [PATCH 19/19] Small fixes --- Snakefile | 2 +- getApps.sh | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Snakefile b/Snakefile index e0484d3..2968040 100644 --- a/Snakefile +++ b/Snakefile @@ -15,7 +15,7 @@ Main variables used in the workflow SAMPLES = np.unique([ os.path.basename(f).split('.fa')[0] for f in os.listdir("data/haplotypes/") - if re.search(r".fa", f) + if re.search(r"\.fa", f) ]) # Retrieving the list of haplotypes excluding the reference diff --git a/getApps.sh b/getApps.sh index 76fed73..1ba6cc9 100755 --- a/getApps.sh +++ b/getApps.sh @@ -15,7 +15,7 @@ while getopts "a:" option; do done # Script -apptainer build $appdir/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest -apptainer build $appdir/pan1c-env.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cenv:latest -apptainer build $appdir/pggb.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/pggb:latest -apptainer build $appdir/pan1c-box.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cbox:latest +apptainer pull $appdir/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest +apptainer pull $appdir/pan1c-env.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cenv:latest +apptainer pull $appdir/pggb.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/pggb:latest +apptainer pull $appdir/pan1c-box.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cbox:latest -- GitLab