thesis.latex

%-------------------------------------
%
%	Dissertation
%
%-------------------------------------

\documentclass[a4paper,twoside,12pt,openright]{book}
\usepackage[lmargin=142pt,rmargin=95pt,tmargin=127pt,bmargin=123pt]{geometry}

%---- packages ----

\usepackage[hyphens]{url}
\usepackage[margin=10pt,font=small,labelfont=bf]{caption}

% fancy page layout
\usepackage{fancyhdr}

% toc formatting
\usepackage{tocbibind}
\usepackage[titles]{tocloft}
% Font of chapter entries in the table of contents (tocloft hook):
% OT1-encoded phv family, series bx, upright shape, at 11pt on a 13pt baseline.
\renewcommand{\cftchapfont}{%
	\usefont{OT1}{phv}{bx}{n}%
	\fontsize{11}{13}\selectfont
}

% language
\usepackage[shorthands=true,ngerman,english]{babel}  % use babel because polyglossia gives errors
\usepackage{csquotes}

% ams-latex
\usepackage{amssymb}
\usepackage{amsmath}

% graphics
%\usepackage[usenames,dvipsnames]{color}

% tables
\usepackage{longtable}
%\usepackage[table]{xcolor}
\usepackage{tabularx}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{arydshln}
\usepackage{lscape}
\usepackage{array}
\usepackage{subscript}
\usepackage{booktabs}

% links
\usepackage{pdflscape}
\usepackage{tabu}

% formatting of urls
\usepackage{url}

% chapter headers
\usepackage[Sonny]{fncychap}
\ChNameVar{\large\sf}
\ChNumVar{\huge}
\ChTitleVar{\Huge\sf}
\ChRuleWidth{0.5pt}
\ChNameUpperCase
\usepackage{emptypage}  % remove header on empty pages

% line spacing
\usepackage{setspace}

% abbreviations
\usepackage[intoc]{nomencl}
\renewcommand{\nomname}{List of Abbreviations}
\setlength{\nomitemsep}{-\parsep}
\makenomenclature
% run additionally: makeindex main.nlo -s nomencl.ist -o main.nls
% 4 markers used: 1_v (virus), 2_s (viral strains), 3_m (methodology), 4_o (others)
% additionally structure the computed tex file into sections manually

% include whole pdfs
\usepackage{pdfpages}
\usepackage{textcomp}

%---- page layout ----

% Explicit page geometry: A4 paper size (210mm x 297mm), no extra offsets, and
% the individual LaTeX layout lengths for side margins, header, text block,
% margin notes and footer.
% NOTE(review): geometry is also loaded with explicit margins at the top of the
% preamble, and \newgeometry/\restoregeometry are used on the cover page --
% confirm which of the two layout specifications is meant to be authoritative.
\setlength{\paperheight}{297mm}
\setlength{\paperwidth}{210mm}

\setlength{\hoffset}{0.00cm}
\setlength{\voffset}{0.00cm}

\setlength{\oddsidemargin}{1.5cm}
\setlength{\evensidemargin}{0cm}
\setlength{\topmargin}{1mm}
\setlength{\headheight}{1.36cm}
\setlength{\headsep}{1.00cm}
\setlength{\textheight}{20.84cm}
\setlength{\textwidth}{14.5cm}
\setlength{\marginparsep}{1mm}
\setlength{\marginparwidth}{3cm}
\setlength{\footskip}{2.36cm}


%---- fancy page layout ----
% Running headers via fancyhdr.
\pagestyle{fancy}
% Left mark: upper-cased "Chapter <number>. <title>"; the right mark is cleared.
\renewcommand{\chaptermark}[1]{\markboth{\MakeUppercase{Chapter \thechapter. #1 }}{}}
% Right mark: "<section number> <title>".
\renewcommand{\sectionmark}[1]{\markright{\thesection\ #1}}
% Clear all header and footer fields before setting the ones that are used.
\fancyhf{}
% Odd pages, right header field: right mark followed by the bold page number.
\fancyhead[RO]{\rightmark \hspace{0.5cm} \textbf{\thepage}}
% Even pages, left header field: bold page number followed by the left mark.
\fancyhead[LE]{\textbf{\thepage} \hspace{0.5cm} \leftmark}
% Thin rule below the header, no rule above the footer.
\renewcommand{\headrulewidth}{0.5pt}
\renewcommand{\footrulewidth}{0pt}
% Enlarge the header box by 0.5pt (the width of the head rule set above).
\addtolength{\headheight}{0.5pt}
% "plain" pages: empty header fields and no head rule.
\fancypagestyle{plain}{
  \fancyhead{}
  \renewcommand{\headrulewidth}{0pt}
}

%---- references ----
%\renewcommand{\refname}{References}
%\renewcommand{\bibname}{References}

%---- own command definitions ----
%\input{config/definitions.own}

% turn off the warnings for those nasty overfull and underfull hboxes and vboxes
\hbadness=10000
\hfuzz=1pt
\vbadness=10000
\vfuzz=1pt

% control pdf output settings
%\pdfminorversion=4  # PDF 1.4 is basis for PDF/A 1b
%\pdfobjcompresslevel=0  # not allowed in PDF/A 1b
%\pdfcompresslevel=0  # suppress LZW stream compression for PDF/A 1b

\usepackage{glossaries}
%\input{content/glossary.tex}
\makeglossary
\usepackage{subcaption}
\usepackage{footnote}

% control hyphenation in titles
\usepackage[raggedright]{titlesec}

% add appendix
\usepackage[toc,page]{appendix}

% set paragraph spacing
%\usepackage{parskip}
\usepackage[iso]{datetime}
\usepackage[T1]{fontenc}
\usepackage{lmodern}
\usepackage{amssymb,amsmath}
\usepackage{bm}
\usepackage{ifxetex,ifluatex}
\usepackage{fixltx2e} % provides \textsubscript
% use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
  \usepackage[utf8]{inputenc}
\else % if luatex or xelatex
  \ifxetex
    \usepackage{mathspec}
    \usepackage{xltxtra,xunicode}
  \else
    \usepackage{fontspec}
  \fi
  \defaultfontfeatures{Mapping=tex-text,Scale=MatchLowercase}
  \newcommand{\euro}{€}
\fi
% use microtype if available
\IfFileExists{microtype.sty}{\usepackage{microtype}}{}
\usepackage{longtable,booktabs}

% pandoc if(graphics) deactivated because set by pandoc for some reason
\usepackage[export]{adjustbox}  % loads graphicx but we can define padding around figures
% Wrap \includegraphics: a call that brings its own [options] is handed to the
% original command unchanged; a call without options gets the default option
% list (padding of 15 on all four sides, width capped at \linewidth).
\let\Oldincludegraphics\includegraphics
\makeatletter
\renewcommand{\includegraphics}{%
  \@ifnextchar[%
    {\Oldincludegraphics}%
    {\Oldincludegraphics[padding=15 15 15 15,max width=\linewidth]}%
}
\makeatother

\ifxetex
  \usepackage[setpagesize=false, % page size defined by xetex
              unicode=false, % unicode breaks when used with xetex
              xetex,
              pdfa]{hyperref}
\else
  \usepackage[unicode=true,pdfa]{hyperref}
\fi
\hypersetup{breaklinks=true,
            bookmarks=true,
            bookmarksdepth=3,
            pdfauthor={},
            pdftitle={},
            colorlinks=true,
            citecolor=black,
            urlcolor=black,
            linkcolor=black,
            pdfborder={0 0 0}}
\urlstyle{same}  % don't use monospace font for urls
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}
\setlength{\emergencystretch}{3em}  % prevent overfull lines
\setcounter{secnumdepth}{0}

%redefine figure placement rules
\usepackage{float}
% Keep handles on the original figure environment, then wrap it: the optional
% placement specifier written in the document (e.g. [htbp]) is read and
% discarded, and every figure is started with [!htb] instead.
% (Replace [!htb] by [H] from the float package for direct placement.)
\let\origfigure\figure
\let\endorigfigure\endfigure
\renewenvironment{figure}[1][2]%
  {\origfigure[!htb]}%
  {\endorigfigure}

% place figures and tables in the same sections/subsections
\usepackage{placeins}
% Issue a \FloatBarrier before every \section heading so that pending floats
% are placed before the next section starts.
% The former \renewenvironment wrapper had the barrier in the environment's end
% code, which never runs for the command form \section{...}, and its own
% optional argument swallowed the short title of \section[short]{long}.
% Prepending the barrier leaves all argument parsing (star form, optional short
% title) to the original command.
\let\origsection=\section
\renewcommand{\section}{\FloatBarrier\origsection}
% Issue a \FloatBarrier before every \subsection heading so that pending floats
% are placed before the next subsection starts.
% The former \renewenvironment wrapper had the barrier in the environment's end
% code, which never runs for the command form \subsection{...}, and its own
% optional argument swallowed the short title of \subsection[short]{long}.
% Prepending the barrier leaves all argument parsing (star form, optional short
% title) to the original command.
\let\origsubsection=\subsection
\renewcommand{\subsection}{\FloatBarrier\origsubsection}
% Issue a \FloatBarrier before every \subsubsection heading so that pending
% floats are placed before the next subsubsection starts.
% The former \renewenvironment wrapper had the barrier in the environment's end
% code, which never runs for the command form \subsubsection{...}, and its own
% optional argument swallowed the short title of \subsubsection[short]{long}.
% Prepending the barrier leaves all argument parsing (star form, optional short
% title) to the original command.
\let\origsubsubsection=\subsubsection
\renewcommand{\subsubsection}{\FloatBarrier\origsubsubsection}
% Issue a \FloatBarrier before every \paragraph heading so that pending floats
% are placed before the next paragraph-level unit starts.
% The former \renewenvironment wrapper had the barrier in the environment's end
% code, which never runs for the command form \paragraph{...}, and its own
% optional argument swallowed the short title of \paragraph[short]{long}.
% Prepending the barrier leaves all argument parsing (star form, optional short
% title) to the original command.
\let\origparagraph=\paragraph
\renewcommand{\paragraph}{\FloatBarrier\origparagraph}
% Issue a \FloatBarrier before every \subparagraph heading so that pending
% floats are placed before the next subparagraph-level unit starts.
% The former \renewenvironment wrapper had the barrier in the environment's end
% code, which never runs for the command form \subparagraph{...}, and its own
% optional argument swallowed the short title of \subparagraph[short]{long}.
% Prepending the barrier leaves all argument parsing (star form, optional short
% title) to the original command.
\let\origsubparagraph=\subparagraph
\renewcommand{\subparagraph}{\FloatBarrier\origsubparagraph}

% \tightlist (defined only if not already available): sets both the space
% between list items and the paragraph skip to zero.
\providecommand{\tightlist}{%
  \itemsep=0pt\relax
  \parskip=0pt\relax
}
% \smhd{<text>}: bold heading used on the declaration, summary and
% acknowledgement pages. <text> is set at \Large in the OT1-encoded phv
% family, series b, upright shape; the font change stays local to the heading.
\newcommand{\smhd}[1]{{%
  \Large
  \usefont{OT1}{phv}{b}{n}\selectfont
  #1%
}}
\begin{document}

\begin{titlepage}
\newgeometry{top=100pt,bottom=100pt,right=35pt,left=55pt}
\thispagestyle{empty}
\vspace*{\fill}
\includegraphics[width=\textwidth]{cover/coverimage.pdf}
\vspace*{\fill}

\newpage
\restoregeometry
\thispagestyle{empty}
%\begin{center}
\small{
\input{cover/description.latex}
}
%\end{center}
\end{titlepage}
\restoregeometry

\frontmatter
\setlength{\parindent}{0pt}
\renewcommand{\baselinestretch}{1.00}\normalsize

%-------------------------------------
%	Title page
%-------------------------------------

\pagestyle{empty}
%\vspace*{0.3cm}
\begin{center}
\huge{
 Computational Methods\\
 for Taxonomic Annotation\\
 and Genome Reconstruction\\
 in Metagenomics
 \par
}

\onehalfspacing

\vspace*{2.5cm}

\large
Kumulative Dissertation

\vspace*{1.5cm}

zur\\
Erlangung des Doktorgrades\\
der Mathematisch-Naturwissenschaftlichen Fakultät \\
der Heinrich-Heine-Universität Düsseldorf \\


\vspace*{1.0cm}

vorgelegt von\\

\vspace*{0.5cm}

\textbf{Johannes Dröge}\\
aus Halle (Westf.)

\vspace*{3cm}

Düsseldorf, 2017-03-31


\end{center}

\newpage

aus dem Institut für Informatik\\
der Heinrich-Heine-Universität Düsseldorf

\vspace*{10cm}

Gedruckt mit der Genehmigung der\\
Mathematisch-Naturwissenschaftlichen Fakultät der\\
Heinrich-Heine-Universität Düsseldorf

\vspace*{3cm}

\begin{tabularx}{\textwidth}{ X X }
Referent: & Prof. Dr. Alice C. McHardy \\
Koreferent: & Prof. Dr. Martin J. Lercher\\
 & \\
Tag der mündlichen Prüfung: &  2017-07-17 \\
\end{tabularx}
%-------------------------------------
%	Selbstaendigkeitserklaerung
%-------------------------------------

\newpage
\pagestyle{plain}

% \vspace*{2cm}
{
	\selectlanguage{ngerman}

	\smhd{Selbstst{\"a}ndigkeitserkl{\"a}rung}  % stupid command cannot take utf8

	\vspace*{1cm} {\parindent=0pt
	Hiermit erkläre ich, dass ich die vorliegende Dissertation eigenständig und ohne fremde Hilfe angefertigt habe. Arbeiten Dritter wurden entsprechend zitiert. Diese Dissertation wurde bisher in dieser oder ähnlicher Form noch bei keiner anderen Institution eingereicht. Ich habe bisher keine erfolglosen Promotionsversuche unternommen.

	\vspace{1cm}

	 Düsseldorf, den ~ \makebox[3cm]{\dotfill} \hfill\hfill\hfill \makebox[5cm]{\dotfill}
		\\
	\small\rule{0pt}{0pt} \hfill(Johannes Dröge)
	}
}

\vspace*{8cm}

\smhd{Statement of authorship}

\vspace*{1cm} {\parindent=0pt
I hereby certify that this dissertation is the result of my own work. No other person's work has been used without due acknowledgement. This dissertation has not been submitted in the same or similar form to other institutions. I have not previously failed a doctoral examination procedure.}

\cleardoublepage
%-------------------------------------
%	Abstract English
%-------------------------------------

\smhd{Summary}

\vspace*{0.5cm}
\onehalfspacing
Microbial communities can be found in almost every place, from biogas
reactors and deep-sea vents to the surface of plant leaves and roots and
the human body, which hosts a plethora of foreign cells in its digestive
system. These communities may consist of thousands upon thousands of
microorganisms, including bacteria, archaea, algae and fungi, which
coexist within their habitats but which cannot simply be cultivated and
studied due to their complex mutual dependencies and environmental
requirements. Metagenomics is a field dedicated to the genetic analysis
of such communities. The genes of their members enable their survival,
for instance by making nutrients accessible, by neutralizing toxic
compounds or by allowing symbiosis with other organisms. Through the use
of nucleotide sequencing technologies, this genetic diversity can be
explored and rendered usable, for instance in the form of new
antibiotics or as enzymes in biotechnology. Apart from their considerable
economic potential, metagenomic approaches lead to a fundamentally
improved understanding of the microbial processes on earth.\\
With current technology, it is not directly possible to sequence
contiguous genomes from microbial communities. Instead, short sequences,
called reads, are produced, which need to be assembled into genes and
longer genome sequences using computer programs. Depending on the size
and complexity of the metagenome, this task can be very difficult. This
thesis describes two methods for assigning metagenomic sequences to
taxonomic groups or genomes. The results can be used to analyze the
genes, and the corresponding proteins and functions, within their
phylogenetic and genetic context to gain better insight into the
functioning of individual organisms and the microbial community.\\
Our first method, \emph{taxator-tk}, assigns nucleotide sequences from
metagenomes to corresponding taxa and addresses two challenges: the
precise prediction of taxa and the application to datasets, which are
constantly growing due to the rapid progress in DNA sequencing. Since
annotation methods such as \emph{taxator-tk}, which require similarity
to known genomes, spend a considerable part of their runtime for
sequence comparison, our algorithm exploits the underlying phylogenetic
structure for similar gene sequences to efficiently calculate the
taxonomic assignment. The same phylogenetic principles are used to
achieve a high assignment precision.\\
The second method in this thesis helps researchers to reconstruct
individual genomes. It is a statistical classification model for
metagenome data, for which we outline several direct and follow-up
applications. These include classification of nucleotide sequences to
individual genomes, \emph{de-novo} calculation of genome clusters in
metagenomes, \emph{in-silico} sample enrichment for genomes and quality
checking of reconstructed genomes. We published the method as a software
library named \emph{MGLEX} for integration into other programs to enable
the efficient use of the data for reconstructing genomes in different
scenarios.\\
Presumably, metagenomics will continue to play an important role in
microbial research, and may partially obviate the sequencing of cloned
strain genomes. This trend is supported by the rapid development of DNA
sequencing technologies, which is progressing towards faster sequencing
and longer reads. The presented methods supplement the existing set of
bioinformatics tools for acquiring knowledge from metagenomes. By
reducing metagenomes to individual genomes, one can apply traditional
algorithms from genomics, for instance to reconstruct metabolic
pathways, and one can link data from transcriptomic and proteomic
experiments. Therefore, there is much interest in genome reconstruction
methods, like the ones presented in this thesis.
\newpage
%-------------------------------------
%	Abstract German
%-------------------------------------
\cleardoublepage

{
  \selectlanguage{ngerman}
	\smhd{Zusammenfassung}
	
	\vspace*{0.5cm}
	\frenchspacing
	Mikrobielle Gemeinschaften existieren praktisch überall, in
Biogas-Anlagen, heißen Quellen am Meeresgrund, auf der Oberfläche von
Pflanzenblättern und -wurzeln und auch im menschlichen Körper, welcher
z.~B. im Verdauungstrakt an genetisch fremden Zellen ein Vielfaches
seiner selbst beherbergt. Sie können aus Abertausenden von
Mikroorganismen, wie Bakterien, Archäen, Algen und Pilzen, bestehen, die
innerhalb ihrer Umgebung koexistieren und auf Grund ihrer komplexen
wechselseitigen Abhängigkeiten und speziellen Umgebungsanforderungen
nicht ohne Weiteres isoliert, kultiviert und untersucht werden können.
Das Feld der Metagenomik widmet sich der genetischen Analyse dieser
Gemeinschaften. Die Gene ihrer Mitglieder sichern ihnen das Überleben,
indem sie unter anderem Nahrung verwertbar machen, Gifte neutralisieren
oder Symbiosen mit anderen Organismen ermöglichen. Durch die Technik der
Gensequenzierung kann man diesen genetischen Reichtum untersuchen und
für Anwendungen nutzbar machen, z.~B. in Form von neuen Antibiotika oder
als Enzyme in der Biotechnologie. Abgesehen von dem großen ökonomischen
Potential ermöglicht die Metagenomik ein fundamental besseres
Verständnis der mikrobiologischen Prozesse auf unserer Erde.\\
Auf direktem Weg können nach heutigem technischen Stand noch keine
zusammenhängenden Genome der mikrobiellen Gemeinschaften sequenziert
werden. Vielmehr ergeben sich viele kurze DNA-Abschnitte, sogenannte
Reads, die durch Computerprogramme zu Gen- und längeren Genom-Sequenzen
zusammengesetzt werden müssen, was sich je nach Größe und Komplexität
des Metagenoms als sehr schwierig erweisen kann. Diese Doktorarbeit
beschreibt zwei Methoden, die das Ziel verfolgen, metagenomische
Sequenzen bestimmten taxonomischen Gruppen oder Genomen zuzuordnen.
Dadurch können die Gene bzw. ihre zugehörigen Proteine und Funktionen im
phylogenetischen und genetischen Kontext analysiert werden, um so ein
besseres Verständnis der Funktionsweise der Organismen und der
mikrobiellen Gemeinschaft zu erlangen.\\
Die erste Methode, \emph{taxator-tk}, weist Nukleotidsequenzen aus
Metagenomen bestimmten Taxa zu und begegnet dabei zwei
Herausforderungen: zum einen der präzisen Vorhersage und zum anderen der
Anwendbarkeit auf Datensätze, deren Größe mit dem rapiden Fortschritt
der DNA-Sequenzierung stetig ansteigt. Annotationsmethoden wie
\emph{taxator-tk}, die auf Ähnlichkeit zu bereits bekannten Genomen
setzen, benötigen einen beträchtlichen Teil ihrer Laufzeit für die
Berechnung der Sequenzähnlichkeiten. Daher nutzt unser Algorithmus die
zugrunde liegende phylogenetische Struktur ähnlicher Gensequenzen zur
effizienten Berechnung einer taxonomischen Vorhersage. Durch die
Anwendung der gleichen phylogenetischen Prinzipien erreicht er eine hohe
Präzision der Vorhersagen.\\
Die zweite in dieser Arbeit vorgestellte Methode unterstützt Forscher
bei der Rekonstruktion einzelner Genome. Es handelt sich um ein
statistisches Klassifikationsmodell für Metagenomdaten, für das
zahlreiche direkte und weitergehende Anwendungsmöglichkeiten skizziert
werden. Diese umfassen die Klassifizierung von Nukleotidsequenzen nach
Genomen, die \emph{de-novo}-Berechnung von Genom-Clustern, die
\emph{in-silico}-Anreicherung von Genomsequenzdaten und die
Qualitätskontrolle rekonstruierter Genome. Die Methode wurde als
Software-Bibliothek namens \emph{MGLEX} zur Verwendung in anderen
Programmen veröffentlicht und ermöglicht dadurch eine effiziente
Datenverwertung bei der Rekonstruktion von Genomen in unterschiedlichen
Situationen.\\
Es ist zu erwarten, dass die Metagenomik eine wichtige Rolle in der
mikrobiologischen Forschung spielen und zunehmend in Konkurrenz zur
Genomsequenzierung geklonter Stämme treten wird. Diese Prognose wird
auch durch die rasante Entwicklung der DNA-Sequenziertechniken getragen,
die eine immer schnellere Sequenzierung immer längerer Reads
ermöglichen. Die hier vorgestellten Methoden ergänzen das Repertoire
vorhandener Bioinformatik-Werkzeuge zur Gewinnung von Erkenntnissen aus
Metagenomen. Die Reduzierung von Metagenomen auf einzelne Genome
ermöglicht sowohl die Anwendung klassischer Algorithmen der Genomik,
z.~B. zur Rekonstruktion von Stoffwechselpfaden, als auch die Verknüpfung
mit experimentellen Daten der Transkriptomik und Proteomik. Daher sind
Verfahren zur Rekonstruktion einzelner Genome, wie sie in dieser Arbeit
vorgestellt werden, von großem generellem Interesse.
}
\vspace*{0.5cm}
\newpage
%-------------------------------------
%	Acknowledgements German
%-------------------------------------
%\cleardoublepage

{
  \selectlanguage{ngerman}
	\smhd{Danksagung}

	\vspace*{0.5cm}
	\frenchspacing
	Zahlreiche Personen haben mich im Verlauf meiner Promotion begleitet und
unterstützt. Ihnen gebührt mein vollster Dank, auch wenn ich an dieser
Stelle nicht alle erwähnen kann. Ohne die Hilfe, die wissenschaftliche
Expertise und die Ideen von Alice McHardy wäre diese Arbeit nicht
möglich gewesen. Außerdem bedanke ich mich bei meinen beiden Koautoren
Alexander Schönhuth und Ivan Gregor, die mich in der Konzeption und
Ausführung meiner Publikationen unterstützt haben. Unersetzlich war für
mich über die gesamte Zeit meiner Promotion auch die Unterstützung durch
meine Freundin Diana Rodriguez und meine Familie. Durch die gemeinsame
Arbeit in unserer Forschungsgruppe habe ich mich immer bestärkt gefühlt,
wofür ich mich bei meinen zahlreichen Kollegen bedanke. Hervorheben will
ich Aaron Weimann und David Lähnemann für die vielen fachlichen
Diskussionen, die gemeinsamen Erlebnisse und ihren fortwährenden Einsatz
für ein gutes Betriebsklima. Für ihre andauernde Hilfe in
organisatorischen Angelegenheiten und ihre wohlwollende Art möchte ich
mich zuletzt auch bei Angela Rennwanz bedanken.
}
\vspace*{0.5cm}
\newpage
%\selectlanguage{english}

%-------------------------------------
%	Table of content
%-------------------------------------

%---- fancy page layout ----
% Switch back to the fancy page style after the front pages (which used
% \pagestyle{empty} and \pagestyle{plain}) and set up the running headers.
% NOTE(review): these lines repeat the header setup already made in the
% preamble, including the \addtolength below, so \headheight is enlarged by
% 0.5pt a second time -- confirm that this is intended.
\pagestyle{fancy}
% Left mark: upper-cased "Chapter <number>. <title>"; the right mark is cleared.
\renewcommand{\chaptermark}[1]{\markboth{\MakeUppercase{Chapter \thechapter. #1 }}{}}
% Right mark: "<section number> <title>".
\renewcommand{\sectionmark}[1]{\markright{\thesection\ #1}}
% Clear all header and footer fields before setting the ones that are used.
\fancyhf{}
% Odd pages, right header field: right mark followed by the bold page number.
\fancyhead[RO]{\rightmark \hspace{0.5cm} \textbf{\thepage}}
% Even pages, left header field: bold page number followed by the left mark.
\fancyhead[LE]{\textbf{\thepage} \hspace{0.5cm} \leftmark}
% Thin rule below the header, no rule above the footer.
\renewcommand{\headrulewidth}{0.5pt}
\renewcommand{\footrulewidth}{0pt}
\addtolength{\headheight}{0.5pt}
% "plain" pages: empty header fields and no head rule.
\fancypagestyle{plain}{
  \fancyhead{}
  \renewcommand{\headrulewidth}{0pt}
}

{
  %---- table of contents ----
  %\cleardoublepage
  \setcounter{secnumdepth}{2}
  \setcounter{tocdepth}{2}
  \tableofcontents

  %---- figure index ----
  %\cleardoublepage
  \renewcommand\listfigurename{List of Figures}  % title can be changed here
  \listoffigures

  %---- list index ----
  %\cleardoublepage
  \renewcommand\listtablename{List of Tables}  % title can be changed here
  \listoftables

  \cleardoublepage
  \markboth{\MakeUppercase{\nomname}}{\nomname}
  %define acronyms
  %\glsaddall
  %\hypersetup{linkcolor=black}
  %\setcounter{tocdepth}{2}
  %\tableofcontents
}
\mainmatter
\onehalfspacing

\chapter{Synopsis}\label{synopsis}

\section{Metagenomics}\label{metagenomics}

Metagenomics is a more recent variant of genomics which pursues medical
or ecological questions at the scale of microbial communities using
nucleotide sequencing. In contrast to microbial genomics, which is
focused on single strains traditionally grown in lab cultures before
genome sequencing, the metagenomic approach applies direct sampling from
a natural ecosystem without cultivation. Microbes form so-called
communities in their micro-environment because they interact, for
instance by symbiosis (e.g.~sharing metabolites) or competition
(e.g.~for food). Such a community may consist of hundreds or thousands
of different species, which are connected by complex interactions (Berry
\& Widder, \protect\hyperlink{ref-BerryDeciphering2014}{2014}; Fuhrman,
Cram \& Needham, \protect\hyperlink{ref-FuhrmanMarine2015}{2015}). It is
the principal interest of microbial ecology to understand these
interaction networks, which make it difficult to isolate and grow the
organisms on culture medium because the specific cultivation conditions
cannot be reproduced (Riesenfeld, Schloss \& Handelsman,
\protect\hyperlink{ref-RiesenfeldMetagenomics2004}{2004}; Stewart,
\protect\hyperlink{ref-StewartGrowing2012}{2012}). However, by
extracting and sequencing environmental DNA directly after sampling, one
can capture the genomes of all community members, although in a highly
fractional and usually incomplete form. One could say that current
metagenomics trades the species-level resolution and the completeness of
very few genomes for a higher level view on the genes in a community.
The metagenome, a term coined in the early 2000s (Rondon et al.,
\protect\hyperlink{ref-RondonCloning2000}{2000}; Riesenfeld, Schloss \&
Handelsman, \protect\hyperlink{ref-RiesenfeldMetagenomics2004}{2004};
Tyson et al., \protect\hyperlink{ref-TysonCommunity2004}{2004}), stands
for all the genes in a microbial community. These genes determine the
ecological functions of the community members through the proteins they
encode. Metagenome sequencing can thus collect new environmental genes
and discover protein functions with potential use in medicine and
biotechnology, and provides a way to understand the microbial
interactions within diverse ecosystems. It has been used to study many
different environments (Figure~\ref{fig:metagenomes_environments}).

\begin{figure}[htbp]
\centering
\includegraphics{figure/metagenome_title_clusters.pdf}
% NOTE(review): the short caption says 8211 titles, the long caption 10,043 -- confirm the correct number.
\caption[Microbial environments extracted from 8211 publication titles]{Microbial environments extracted from 10,043 publication titles (2011--2017) positioned by co-occurrence in publication titles. The articles were selected by topic and the corresponding metadata downloaded from Europe PMC (europepmc.org). The titles were then reduced to environment-related words and these were grouped by the number of co-occurrences using Gephi (gephi.org) with a force-directed layout and subsequent annotation of clusters. Three major clusters emerge, relating to aquatic environments, soil and plant biomass degradation and (human) host-related environments.}\label{fig:metagenomes_environments}
\end{figure}

Early metagenomic studies have impressively demonstrated the potential
of this new approach. For instance, new antibiotics and antibiotic
resistance genes were identified (Gillespie et al.,
\protect\hyperlink{ref-GillespieIsolation2002}{2002}; Riesenfeld,
Goodman \& Handelsman,
\protect\hyperlink{ref-RiesenfeldUncultured2004}{2004}). An ocean survey
(Venter et al., \protect\hyperlink{ref-VenterEnvironmental2004}{2004})
revealed hundreds of new rhodopsin-like genes in seawater environments
(rhodopsin is a protein essential for sensing light) among over 1.2
million novel genes. Subsequently, numerous micro-environments were
explored to provide a census of genes and species, many of them
previously unknown. For the various sites in and on the human body,
which represent well-studied environments due to medical applications,
the resulting data provided new insight into the interactions between
the human host and its so-called microbiome. For instance, abnormal
microbial colonization of the gut was observed with chronic inflammation
(Qin et al., \protect\hyperlink{ref-QinHuman2010}{2010}). Although most
investigations have focused on the bacteria, the best known domain in
the microbial tree of life, metagenomics has also been used to study the
genes of archaea, microscopic eukaryotes, viruses and genetic elements
like plasmids (Hugenholtz \& Tyson,
\protect\hyperlink{ref-HugenholtzMicrobiology2008}{2008}; Cuvelier et
al., \protect\hyperlink{ref-CuvelierTargeted2010}{2010}; Garrett et al.,
\protect\hyperlink{ref-GarrettMetagenomic2010}{2010}), which helped to
broaden the view on the global genetic repertoire of life and its
evolution.

\subsection{DNA sequencing}\label{dna-sequencing}

Past and present progress in the field of metagenomics is tightly
coupled to the development of next-generation sequencing technologies
(NGS). While earlier studies were based on the Sanger sequencing
technology (Wommack, Bhavsar \& Ravel,
\protect\hyperlink{ref-WommackMetagenomics2008}{2008}), the underlying
chemistry has been subject to many improvements, such as the engineering
of highly parallel reaction and detection procedures. This has led to a
considerable drop in overall time and cost of nucleotide sequencing
(Dröge \& McHardy, \protect\hyperlink{ref-DrogeTaxonomic2012}{2012}).
The first sequencing approaches in metagenomics targeted well studied
single genes, predominantly the bacterial and archaeal gene of the
ribosomal 16S subunit (Quince, Curtis \& Sloan,
\protect\hyperlink{ref-QuinceRational2008}{2008}; Hamady \& Knight,
\protect\hyperlink{ref-HamadyMicrobial2009}{2009}), which is a good
taxonomic marker because it contains both conserved and divergent
regions. In this context, sequence identity thresholds were applied to
define operational taxonomic units (OTUs) as an approximate species
replacement. The variable regions were amplified in a polymerase chain
reaction (PCR) before sequencing and are therefore called amplicons.
Using this selective approach reduced the amount of target DNA from
millions of bases per genome to a few hundreds while giving estimates of
genetic species diversity. Amplicon sequencing is still in use and
represents a cost-effective way to study the taxonomic composition and
taxon abundances. However, it cannot be used to discover the functional
potential unless the corresponding genome sequences are available for
consideration. To target novel community genomes, universal sequencing
primers initiate sequencing at random starting positions on the DNA
strands. This approach is called shotgun sequencing due to the fact that
the reads are more or less randomly scattered over the entire genome
sequence. With a sufficient number of reads, metagenomic shotgun
sequencing can cover most genes and continues to evolve together with
next-generation sequencing platforms, but also with respect to
experimental protocols and data analysis methods. A major limitation of
current sequencing technologies is the length of the primary sequencing
products (reads). In particular, the currently dominating Illumina
sequencing platform produces reads which are still much shorter than
typical genes (Dröge \& McHardy,
\protect\hyperlink{ref-DrogeTaxonomic2012}{2012}) so that overlapping
reads are typically assembled to form longer contiguous sequences
(contigs) (Miller, Koren \& Sutton,
\protect\hyperlink{ref-MillerAssembly2010}{2010}). New technologies such
as PacBio and Oxford Nanopore sequencing yield longer reads but have
larger error rates and higher costs compared to Illumina, which limits
their current use in metagenomics (Goodwin, McPherson \& McCombie,
\protect\hyperlink{ref-GoodwinComing2016}{2016}).

Metagenomic studies have highlighted the advantages of the metagenomic over
the traditional sequencing approach using isolated and cultured strains.
The genomes of environmental microorganisms were found to be much more
genetically diverse than those of corresponding lab strains (Tyson et
al., \protect\hyperlink{ref-TysonCommunity2004}{2004}; Handelsman,
\protect\hyperlink{ref-HandelsmanMetagenomics2004}{2004}), which
essentially represent clones of a single cell. Researchers have also become
more aware of the fact that genetic data collections are strongly biased
towards taxa which are easily grown in lab cultures and which are of
medical relevance, leaving many black spots in the microbial tree of
life (Tyson et al., \protect\hyperlink{ref-TysonCommunity2004}{2004}; Wu
et al., \protect\hyperlink{ref-WuPhylogenydriven2009}{2009}). Using the
exploratory metagenomics approach, there is no need to narrow the focus
on certain species and to hypothesize about the role of these organisms
in their environment beforehand. The bird's eye view on the genes helps
to identify mutual dependencies, such as pathways that are connected
between different genomes (Ponomarova \& Patil,
\protect\hyperlink{ref-PonomarovaMetabolic2015}{2015}), and to associate
new functions and new species. Apart from this, direct sequencing also
creates new problems. Some sequencing platforms introduce a bias related
to the nucleotide composition (Dohm et al.,
\protect\hyperlink{ref-DohmSubstantial2008}{2008}), which may affect the
analysis. In general, it is difficult to distinguish sequencing errors
from natural genetic variation, which, in some cases, could lead to
wrong conclusions such as inflated microbial diversity estimates (Quince
et al., \protect\hyperlink{ref-QuinceAccurate2009}{2009}; Kunin et al.,
\protect\hyperlink{ref-KuninWrinkles2010}{2010}). Another problem with
this sequence heterogeneity is that longer genome sequences often fail
to assemble due to the natural and artificial nucleotide variations in
the reads (Melsted \& Pritchard,
\protect\hyperlink{ref-MelstedEfficient2011}{2011}; Pell et al.,
\protect\hyperlink{ref-PellScaling2012a}{2012}). Typical metagenome data
therefore contain many incomplete genes whose origin and functional role
need to be determined.

\subsection{The role of computer
programs}\label{the-role-of-computer-programs}

Today's genomic data are ubiquitous and abundant due to high-throughput
nucleotide sequencing. Consequently, the data generation marks a
starting point of knowledge discovery, making modern metagenomics in
large part a data-driven science in which algorithms have replaced lab
techniques to sort and analyze genetic material. Metagenome data are
large (because they represent many genomes) and require extensive
processing to deal with the phylogenetic and genetic diversity in the
sample. It is convenient to divide the downstream processing of raw
sequencing data into three consecutive steps which are illustrated in
Figure~\ref{fig:metagenome_processing_steps}: (a) sequence processing
specific to the sequencing platform and often performed by proprietary
software; (b) metagenome analysis and reduction to non-redundant draft
genome sequences; (c) algorithms to study the individual genomes and how
they interact. Step (a) applies not only to metagenomics but to all
sciences using nucleotide sequencing and, from a practical perspective,
decouples downstream algorithms from the specifics of sequencing
technology and its development. The work presented in this thesis
contributes to step (b), to prepare the data for use in downstream
algorithms in step (c), which are tailored to the biological questions.

\begin{figure}[htbp]
\centering
\includegraphics{figure/metagenome_processing_steps.pdf}
\caption[Major steps in metagenome data processing]{Major steps in metagenome data processing. Typical processing consists of three consecutive levels: (a) read processing, (b) contig analysis and binning, and (c) the analysis at the genome level.}\label{fig:metagenome_processing_steps}
\end{figure}

An important step following nucleotide sequencing is the assembly of
overlapping reads into longer contigs. For this, many reads must be
sequenced to cover the corresponding genome positions. In current
Illumina sequencing protocols, pairs of reads are typically linked in
the experimental library preparation (Goodwin, McPherson \& McCombie,
\protect\hyperlink{ref-GoodwinComing2016}{2016}) to capture their
relative orientation and approximate distance (insert size). This
information helps to construct longer contigs, because otherwise
repetitive regions or homologous genes which are longer than the read
length cannot be distinguished if they cause loops in the assembly graph
(Ghurye, Cepeda-Espinoza \& Pop,
\protect\hyperlink{ref-GhuryeMetagenomic2016}{2016}). When the read
coverage drops for intermediate regions, the corresponding genomes also
break into multiple shorter contigs. Existing assemblers for isolate
genome assembly, which have been available for a long time (Sutton et
al., \protect\hyperlink{ref-SuttonTigr1995}{1995}; Huang \& Madan,
\protect\hyperlink{ref-HuangCap31999}{1999}), have been adjusted to
assemble metagenomes (Ghurye, Cepeda-Espinoza \& Pop,
\protect\hyperlink{ref-GhuryeMetagenomic2016}{2016}). Metagenome
assemblers must cope with the natural genetic variance of strains
compared to clonal DNA and must also take into account that, due to
different abundances in the sample, the number of genome copies varies
considerably among the species or strains, resulting in a large range of
read coverages. The assembly of reads for complex communities is
considered an algorithmic challenge, but often reduces the amount of
data considerably and produces a fraction of longer contigs which
represent full or partial genes. Assembly is therefore a reasonable
first step towards recovering the full genome sequence of environmental
microbes. In the workflow in Figure~\ref{fig:metagenome_processing_steps},
the assembly bridges steps (a) and (b) because the input sequencing
reads have a length and error profile which is specific to the
sequencing platform but the output contigs represent generic sequences
with most errors removed.

Genomic methods frequently operate on complete genome sequences, for
instance inferring functional models for specific organisms (Price, Reed
\& Palsson, \protect\hyperlink{ref-PriceGenomescale2004}{2004}). Gene
regions are identified, their corresponding protein sequences determined
and hypothetical pathways constructed. To do similar in metagenomics,
contigs are often grouped to form hypothetical draft genomes, called
genome bins. The binning process tries to reconstruct the genomes and
solves a problem which, at first, appears very similar to that of
metagenome assembly. However, contig binning is usually independent of
the sequencing platform (it makes no use of sequencing quality) and
considers information which assembly programs ignore (e.g.~gene
annotations). Both steps can be iterated in a feedback cycle
(Figure~\ref{fig:assembly_binning_cycle}) to improve the quality of the
resulting genomes (Albertsen et al.,
\protect\hyperlink{ref-AlbertsenGenome2013}{2013}). Metagenome binning
connects steps (b) and (c) in
Figure~\ref{fig:metagenome_processing_steps} because it reduces the data
to individual genomes. This thesis presents algorithms related to the
binning problem which I, in collaboration with my colleagues, developed
and published during my doctoral studies.

\begin{figure}[htbp]
\centering
\includegraphics{figure/assembly_binning_cycle.pdf}
\caption[Assembly and binning cycle]{Assembly and binning cycle for genome reconstruction in metagenomes. Longer contigs yield better preliminary genome bins and when collecting the reads within a bin, these are more specific to the genome and lead to better assembly.}\label{fig:assembly_binning_cycle}
\end{figure}

\subsection{Community transcriptomics, proteomics and
metabolomics}\label{community-transcriptomics-proteomics-and-metabolomics}

Nucleotide gene sequences can only tell about potential functions of an
organism but there may be much more to discover. For instance, we are
interested in seeing genes which are actively expressed and in
understanding how the gene expression is regulated within the community.
The proteins, for which the genes code, are the acting agents in any
organism, so it is most important to determine the functional role of
proteins, how they interact, and which metabolites they target and
mediate. Corresponding experimental techniques for transcriptome,
proteome and metabolome analysis are being adapted and applied to
microbial communities (Turnbaugh \& Gordon,
\protect\hyperlink{ref-TurnbaughInvitation2008}{2008}; Aguiar-Pulido et
al., \protect\hyperlink{ref-AguiarpulidoMetagenomics2016}{2016}). Such
data representing cellular activity are most informative when they can
be linked to the corresponding gene sequences and genomes so that their
regulation and coupling can be studied in detail. The genome bins
derived by metagenome binning can form the basis to build models which
can integrate information from other experiments, for instance measuring
the current state of a community in terms of genome activity,
micro-evolution or population dynamics.

\section{Metagenome binning}\label{metagenome-binning}

Functional screenings of metagenomes (Ufarté, Potocki-Veronese \&
Laville, \protect\hyperlink{ref-UfarteDiscovery2015}{2015}) aim to
identify novel enzymes with biotechnological and medical applications.
However, when studying protein-coding genes and their regulation in more
detail, it is often beneficial to look at the corresponding genomes to
understand the genomic context. One way to collect cells and to retrieve
a full genome sequence is by sampling from the environment followed by
cultivation and sequencing, alternatively using enrichment cultures
(Dong et al., \protect\hyperlink{ref-DongReconstructing2017}{2017}) or
single-cell sequencing (Woyke et al.,
\protect\hyperlink{ref-WoykeAssembling2009}{2009},
\protect\hyperlink{ref-WoykeOne2010}{2010}). However, it can be
difficult to extract specific organisms if there are hundreds or
thousands of distinct species, subspecies or OTUs in a metagenomic
sample (Woyke et al., \protect\hyperlink{ref-WoykeAssembling2009}{2009},
\protect\hyperlink{ref-WoykeOne2010}{2010}; Hess et al.,
\protect\hyperlink{ref-HessMetagenomic2011}{2011}). Furthermore, the
cultivation conditions required to produce clone libraries may be
unknown, and environmental sequencing of extracted cells with small
amounts of DNA is still in its infancy (Mende et al.,
\protect\hyperlink{ref-MendeImproved2016}{2016}; Yu et al.,
\protect\hyperlink{ref-YuMicrofluidicbased2017}{2017}). For these
reasons, \emph{in-silico} metagenomic methods provide a solid
alternative. Metagenome sequence binning is the algorithmic equivalent
for reconstructing individual genomes from shotgun metagenome sequence
data. Broadly speaking, a genome bin is a set of sequences, usually
assembled contigs, which together represent the sequenced part of a
specific community genome. Capturing these partial genomes allows
studying taxa on the level of genes and their associated functions.
Genome binning aims to recover full genomes whereas taxonomic binning
refers to the assignment of contigs to broader taxonomic groups. For an
extensive introduction to metagenome binning, see the review article
(Dröge \& McHardy, \protect\hyperlink{ref-DrogeTaxonomic2012}{2012}) in
appendix C.

\subsection{Binning methodology}\label{binning-methodology}

Binning represents a machine learning procedure in which class labels
(genomes or taxa) are assigned to data points (contigs) (see Hastie,
Tibshirani \& Friedman
(\protect\hyperlink{ref-HastieElements2001}{2001}) chapter 1, for a
comprehensive introduction to these concepts). Most of the different
algorithmic approaches to infer genome bins are either a form of data
clustering or classification, including combinations of both approaches.
Clustering is a so-called unsupervised method, which does not directly
take into account external information like available genome sequences.
The strength of clustering is that it can group any data to explore
their intrinsic structure, being able to group contigs of genomes which
have never been seen before. In contrast, classification algorithms
utilize categorized (labeled) data, for instance large genome sequence
collections, to assign sequences to genome bins. They are said to
operate in a supervised manner. By the use of prior knowledge they can
be very efficient but a major drawback is the difficulty to handle novel
genomes. Clustering and classification methods give complementary
results and it is common to combine them, for instance classifying
genome bins after clustering or initializing clusters using
classification labels (Imelfort et al.,
\protect\hyperlink{ref-ImelfortGroopm2014}{2014}).

\subsection{Sequence information for
binning}\label{sequence-information-for-binning}

Binning methods can also be categorized by the kind of information they
use. Both clustering and classification methods for binning operate on
so-called features derived from reads or contigs. These properties
inform about genome membership and discriminate contigs of different
genomes. Microbial genome sequences exhibit characteristic frequencies
of short nucleotide motifs (Karlin, Mrazek \& Campbell,
\protect\hyperlink{ref-KarlinCompositional1997}{1997}) which are often
used in binning and referred to as the genome or nucleotide composition.
The combined relative frequency of guanine and cytosine (GC-content) is
a simple way to represent nucleotide composition, and an evolutionary
trait of genomes that has long been used to characterize different
species. For instance, many Actinobacteria exhibit a high GC-content.
Most methods, however, use short nucleotide motifs consisting of 4 to 7
bases called \(k\)-mers (\(k\) stands for the number of bases).
Alternative formulations may use Hidden Markov Models (HMMs) to describe
nucleotide composition (Brady \& Salzberg,
\protect\hyperlink{ref-BradyPhymm2009}{2009}). The second major feature
type for binning is read coverage, the number of sequencing reads for
each assembled contig. Since contigs are constructed by stacking
(aligning) overlapping reads, each nucleotide position of a resulting
contig must be covered by at least a single read, but typically many
more. Following random shotgun sequencing with universal primers, the
expected number of reads covering a single position is approximately
proportional to the genome copy number in the sequenced sample (Lander
\& Waterman, \protect\hyperlink{ref-LanderGenomic1988}{1988}), with a
constant factor which depends on the total sequencing effort. Thus
contig coverage helps to discriminate genomes with distinct sample
abundances, but cannot differentiate between equally abundant genomes.
It is therefore desirable to generate multiple metagenome samples of a
community for which the genome copy numbers vary differently. This way,
each genome has a unique set of genome abundances. Recent studies have
shown that genome abundances represent a very informative feature type
to obtain genome bins for complex metagenomes, if many varying samples
are available (Albertsen et al.,
\protect\hyperlink{ref-AlbertsenGenome2013}{2013}; Alneberg et al.,
\protect\hyperlink{ref-AlnebergBinning2014}{2014}). Sometimes, binning
programs may also employ assembly information such as associated contigs
or scaffolds linked by paired reads (Lu et al.,
\protect\hyperlink{ref-LuCocacola2016}{2016}), but such information, if
available, is more frequently used to assess the binning quality (Patil
et al., \protect\hyperlink{ref-PatilTaxonomic2011}{2011}) or to refine
genome bins (Alneberg et al.,
\protect\hyperlink{ref-AlnebergBinning2014}{2014}).

There is a specific class of homology-based classifiers, and an example
of such a method is described in Section~\ref{sec:synopsis_taxator-tk}.
These methods employ a two-step procedure, first identifying potential
homologs for a contig, for instance by alignment to reference sequences,
and second determining a corresponding evolutionary neighborhood. This
neighborhood is usually reported by taxonomy, so that each contig is
annotated with a taxonomic path. A grouping of contigs by taxa then
provides a form of binning but higher-level taxon bins mix contigs from
several genomes, if the sample contains more than a single member of
this group. Hence, taxonomic classification using sequence similarity
can only provide a partial solution to the binning problem. However,
such annotation also informs about the taxonomic sample composition and
diversity, similar to a 16S gene analysis, and may furthermore be used
as secondary features for clustering, for instance to initialize genome
clusters (Imelfort et al.,
\protect\hyperlink{ref-ImelfortGroopm2014}{2014}) or to train a
classification model with sample data (Gregor et al.,
\protect\hyperlink{ref-GregorPhylopythias2016}{2016}; Dong et al.,
\protect\hyperlink{ref-DongReconstructing2017}{2017}). The probabilistic
binning framework presented in Section~\ref{sec:synopsis_mglex} makes
full use of taxonomic annotations similar to the use of nucleotide
composition and contig coverage.

\subsection{Overview of binning
software}\label{overview-of-binning-software}

Binning programs emerged and evolved together with metagenome sequencing
and assembly protocols, so that their focus changed accordingly. Recent
programs for complex communities target longer contigs (1 kb or more)
but some programs were also designed to bin raw sequencing reads (Vinh
et al., \protect\hyperlink{ref-VinhTwophase2015}{2015}; Ulyantsev et
al., \protect\hyperlink{ref-UlyantsevMetafast2016}{2016}), for instance
by comparison to genome sequence collections or nucleotide composition.
Since the latter is unstable for short sequences due to the low number of
counts (McHardy et al.,
\protect\hyperlink{ref-MchardyAccurate2007}{2007}), these programs are
inherently limited to simple communities and community members with
related genome sequences to compare to. Most newer binning programs with
applications to complex metagenomes, which are listed in
Table~\ref{tbl:binning_programs}, operate on co-assembled contigs, which
are constructed using multiple sequenced samples of a microbial
community.

\hypertarget{tbl:binning_programs}{}
\begin{longtable}[]{@{}llllll@{}}
\caption[List of contig binning programs]{\label{tbl:binning_programs}Contig binning programs with type
(taxon or genome bins), methodology and release dates starting from the
year 2011 up to the year 2016. This is a non-exhaustive list with rough
methodology descriptions. Some programs employ additional sequence
information in post-processing procedures which may be omitted here. A
recent overview of binning methods can be found in (Sedlar, Kupkova \&
Provaznik, \protect\hyperlink{ref-SedlarBioinformatics2017}{2017}).
}\tabularnewline
\toprule
\begin{minipage}[b]{0.18\columnwidth}\raggedright\strut
Program\strut
\end{minipage} & \begin{minipage}[b]{0.10\columnwidth}\raggedright\strut
Type\strut
\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright\strut
Technique\strut
\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright\strut
Sequence information\strut
\end{minipage} & \begin{minipage}[b]{0.12\columnwidth}\raggedright\strut
Published/ updated\strut
\end{minipage} & \begin{minipage}[b]{0.11\columnwidth}\raggedright\strut
License\strut
\end{minipage}\tabularnewline
\midrule
\endfirsthead
\toprule
\begin{minipage}[b]{0.18\columnwidth}\raggedright\strut
Program\strut
\end{minipage} & \begin{minipage}[b]{0.10\columnwidth}\raggedright\strut
Type\strut
\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright\strut
Technique\strut
\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright\strut
Sequence information\strut
\end{minipage} & \begin{minipage}[b]{0.12\columnwidth}\raggedright\strut
Published/ updated\strut
\end{minipage} & \begin{minipage}[b]{0.11\columnwidth}\raggedright\strut
License\strut
\end{minipage}\tabularnewline
\midrule
\endhead
\begin{minipage}[t]{0.18\columnwidth}\raggedright\strut
PhyloPythiaS\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
taxon\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
Structured Support Vector Machine (SVM)\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
5-mers\strut
\end{minipage} & \begin{minipage}[t]{0.12\columnwidth}\raggedright\strut
(\protect\hyperlink{ref-PatilTaxonomic2011}{2011})/ (2012)\strut
\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright\strut
proprietary\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.18\columnwidth}\raggedright\strut
MetaWatt 3.x\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
genome\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
Heuristic thresholds\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
4-mers, differential coverage\strut
\end{minipage} & \begin{minipage}[t]{0.12\columnwidth}\raggedright\strut
(\protect\hyperlink{ref-StrousBinning2012}{2012})/ (2015)\strut
\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright\strut
AFL\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.18\columnwidth}\raggedright\strut
CONCOCT\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
genome\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
Gaussian mixture clustering\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
4-mers, differential coverage\strut
\end{minipage} & \begin{minipage}[t]{0.12\columnwidth}\raggedright\strut
(\protect\hyperlink{ref-AlnebergBinning2014}{2014})/ (2015)\strut
\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright\strut
BSD\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.18\columnwidth}\raggedright\strut
GroopM\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
genome\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
Biclustering\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
4-mers, differential coverage\strut
\end{minipage} & \begin{minipage}[t]{0.12\columnwidth}\raggedright\strut
(\protect\hyperlink{ref-ImelfortGroopm2014}{2014})/ (2016)\strut
\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright\strut
GPL\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.18\columnwidth}\raggedright\strut
MaxBin 2.0\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
genome\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
Expectation-Maximization (EM) clustering\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
4-mers, differential coverage\strut
\end{minipage} & \begin{minipage}[t]{0.12\columnwidth}\raggedright\strut
(\protect\hyperlink{ref-WuMaxbin2014}{2014})/ (2016)\strut
\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright\strut
BSD\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.18\columnwidth}\raggedright\strut
MetaBAT\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
genome\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
Distance-based clustering\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
4-mers, differential coverage\strut
\end{minipage} & \begin{minipage}[t]{0.12\columnwidth}\raggedright\strut
(\protect\hyperlink{ref-KangMetabat2015}{2015})/ (2016)\strut
\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright\strut
proprietary\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.18\columnwidth}\raggedright\strut
PhyloPythiaS+\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
taxon\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
Structured Support Vector Machine (SVM)\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
5-mers\strut
\end{minipage} & \begin{minipage}[t]{0.12\columnwidth}\raggedright\strut
(\protect\hyperlink{ref-GregorPhylopythias2016}{2016})/ (2014)\strut
\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright\strut
proprietary\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.18\columnwidth}\raggedright\strut
MyCC\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
genome\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
Stochastic neighbor embedding\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
4-mers, differential coverage\strut
\end{minipage} & \begin{minipage}[t]{0.12\columnwidth}\raggedright\strut
(\protect\hyperlink{ref-LinAccurate2016}{2016})/ (2015)\strut
\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright\strut
proprietary\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.18\columnwidth}\raggedright\strut
COCACOLA\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
genome\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
Gaussian mixture clustering\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright\strut
4-mers, differential coverage, genome co-alignment, paired reads\strut
\end{minipage} & \begin{minipage}[t]{0.12\columnwidth}\raggedright\strut
(\protect\hyperlink{ref-LuCocacola2016}{2016})/ (2016)\strut
\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright\strut
GPL\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\subsection{Binning performance
considerations}\label{binning-performance-considerations}

Binning methods are best judged in the context of their use cases.
Clearly, an optimal binning would mean to obtain a single bin for each
genome in the community. Suboptimal solutions contain either multiple
smaller bins for a genome or bins with mixed contigs of different
genomes. While the objective is clear, it is impossible to obtain
perfect genomes for real metagenome data if there is not enough
information to discriminate the contigs, especially shorter ones. All of
the increasing number of binning methods typically produce suboptimal
bins, and there is no consensus in the metagenomics community on the
performance metrics for assessing the bins obtained by different methods
and with different benchmark datasets. Initiatives such as the
\href{http://cami-challenge.org/}{Critical Assessment of Metagenome
Interpretation (CAMI)} (Sczyrba et al.,
\protect\hyperlink{ref-SczyrbaCritical2017}{2017}) work towards
establishing a common understanding to judge metagenome binning.
Different views on the binning quality are valid as this depends on
downstream processing and on the specific research questions. For
instance, the estimation of community structure might only require the
construction of precise small-sized bins whereas a hypothetical pathway
reconstruction for certain genomes might tolerate excess genes in the
corresponding genome bins and discard all of the remaining bins.

Multiple factors, such as the number and abundance of taxa, their
phylogenetic structure, availability of reference genome sequences and
computing resources have an impact on binning performance. Binning
algorithms are sensitive to the type of community; for example,
taxonomic sequence classification methods rely on external genome
sequences and, as a direct consequence, suffer from the uneven coverage
of the tree of life by the reference genomes. Thus, poorly studied
environments such as a deep sea vent community are likely too exotic for
classifiers which only use public genome sequences. In contrast,
communities such as the human gut microbiota are well suited to the
classification approach because there are abundant genome data for these
microbes. Another reason why binning methods perform differently may be
rigid assumptions, for instance standard algorithm parameters which are
optimized to give good results in specific scenarios tested and intended
by the authors.

The broad range of applications involving many different microbial
habitats, custom experimental techniques and heterogeneous sequencing
platforms makes it difficult to define a state of the art for binning.
Nonetheless, general trends can be observed. Recent works which have
presented genome bins derived from complex metagenomes often applied
clustering in concatenated and transformed feature spaces (Imelfort et
al., \protect\hyperlink{ref-ImelfortGroopm2014}{2014}; Alneberg et al.,
\protect\hyperlink{ref-AlnebergBinning2014}{2014}; Kang et al.,
\protect\hyperlink{ref-KangMetabat2015}{2015}; Lin \& Liao,
\protect\hyperlink{ref-LinAccurate2016}{2016}), which integrate several
types of features including nucleotide composition and contig coverage
for multiple samples. Nevertheless, deriving high-quality draft genomes
today still relies on manual analysis and processing of genome bins
(Albertsen et al., \protect\hyperlink{ref-AlbertsenGenome2013}{2013};
Eren et al., \protect\hyperlink{ref-ErenAnvi2015}{2015}).

\newpage

\section{Methods in this thesis}\label{methods-in-this-thesis}

\subsection{\texorpdfstring{Taxonomic annotation of metagenomes
(\emph{taxator-tk})}{Taxonomic annotation of metagenomes (taxator-tk)}}\label{sec:synopsis_taxator-tk}

The method article in Section~\ref{sec:full_taxator-tk} describes a
high-performance tool for taxonomic annotation of metagenomes using
phylogenetic principles. The procedure splits the input sequences
(contigs) into smaller separate homology regions (segments), to which it
applies a newly developed realignment placement algorithm (RPA) for
taxonomic classification of these regions. This algorithm calculates
pairwise alignment scores to estimate the phylogenetic distances and
simultaneously approximates a corresponding tree structure. The
alignments are non-exhaustive and are stopped once a good taxon estimate
has been determined or if no phylogenetic signal can be found in the
input. In a final merging step, the subregion predictions are combined
for the full sequence to minimize the error of the predicted taxon. The
corresponding computer program \emph{taxator-tk} is implemented in C++
and utilizes parallel computation.

\subsubsection{Introduction}\label{introduction}

In metagenomics, we study microbial communities from natural
environments without obtaining cultures. Using sequencing followed by
computational analyses, we can estimate the abundances of taxa, known as
taxonomic profiling, and characterize their metabolic potentials by
sorting nucleotide sequences into genome bins (binning) and predicting
proteins therein. Taxonomic profiling is conceptually different from
taxonomic binning because it only requires (partial) genes, which are
taxonomically informative, and which can be obtained using amplicon
sequencing whereas binning needs to deal with all parts of a genome.
Universal marker genes used for profiling are usually classified by
phylogenetic placement, which considers a gene reference tree of the
corresponding gene as a proxy for the species phylogeny. Random genome
regions, as obtained by shotgun sequencing, typically lack such
reference trees. Therefore, a taxonomy is used instead and query
sequences are compared to reference genomes, which are annotated with
corresponding taxa. Such comparison can be done based on direct sequence
matching or based on nucleotide sequence composition, for instance
\(k\)-mers, which also allows recovering draft genomes from
deep-branching lineages. However, sequence matching by alignment is more
accurate, in particular for sequences shorter than 1 kb. Corresponding
algorithms use alignment scores and threshold parameters to quickly
determine an evolutionary neighborhood of a query but lack a
well-motivated evolutionary framework. Calculating de-novo gene trees
for every query in the metagenome is computationally too demanding for
large metagenome samples. The software \emph{taxator-tk} extends the
traditional score-based approach by approximating phylogenetic gene
trees using a linear number of pairwise alignments and thereby provides
more accurate taxonomic assignments without requiring conservation
threshold parameters.

\subsubsection{Methods}\label{methods}

The workflow for the taxonomic assignment of a query sequence consists
of three parts (Figure~\ref{fig:taxatortk_workflow}): (a) a local
alignment search for homologs, (b) the core assignment algorithm and (c)
a post-processing step to merge subregion annotations. The initial
search can be run by different aligners and using different reference
sequence collections. Based on the resulting local alignments, each
query sequence is split into distinct subregions (segments), omitting
parts which have no similarity to any reference. This step reduces the
overall number of positions for further alignments and accounts for
genome rearrangements. Each segment, along with its homologous reference
sequences, is processed by the core algorithm to predict a taxon. The
final merging step considers all segment predictions of a query sequence
and determines the final taxon for assignment.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_taxatortk/figure_1.pdf}
\caption[Workflow diagram for taxator-tk]{Workflow diagram for the taxonomic assignment of a nucleotide query sequence with \emph{taxator-tk}: (a) Homology search for query sequence in reference collection using local alignment; (b) program taxator splits the query into distinct segments and determines a taxon ID for each; (c) program binner determines a consensus taxon ID for the entire query from the segment predictions.}\label{fig:taxatortk_workflow}
\end{figure}

The core realignment placement algorithm (RPA)
(Figure~\ref{fig:taxatortk_rpa}) assigns a taxon Q to a query segment
\emph{q} using a limited number of pairwise alignments among \emph{q}
and its homologous segments obtained by local alignment to reference
sequences. It aims to identify a set of segments which form a
monophyletic group or subtree in the corresponding phylogeny. First, the
most similar segment \emph{s} is aligned to the query \emph{q} and all
other segments in the set (pass 1). An outgroup segment \emph{o} is
determined as the first sequence with distance larger than
\(\mathit{distance}(s,q)\). The taxa of all segments with distance smaller or
equal to \(\mathit{distance}(s,o)\) are added to the neighborhood set M. Then,
all segments are aligned to the outgroup segment \emph{o} (pass 2),
again adding taxa with distances smaller than \(\mathit{distance}(o,q)\) to M. We
assign the least common ancestor (LCA) of all taxa in M to segment
\emph{q}. The segments in M form a subtree among all available segment
taxa. Sometimes, if no outgroup can be found or if the taxa in M are
very diverse, the algorithm terminates and the predicted taxon is the
taxonomy root, meaning unassigned. The RPA requires approximately \(2n\)
alignments, where \(n\) is the number of reference segments.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_taxatortk/figure_2.pdf}
\caption[Realignment placement algorithm (RPA) steps]{Realignment placement algorithm (RPA) for labeling a query segment \textit{q} with a taxon ID. (a) Underlying taxonomy with query taxon Q and reference taxa A, B, C, D, O and S which is approximated by the query segment alignment. (b) Approximate graph representing pairwise distances between the taxa. The subgraph for clade X is highlighted. (c, d) The two alignment passes which add segment taxa to an empty set \textit{M}. Segment \textit{s} is the segment with the smallest local alignment score (distance) to \textit{q} in the initial similarity search. (c) First, all segments are aligned to segment \textit{s}. The resulting distances are ordered and the taxa with equal or smaller distances than \textit{distance(s,q)} are added to \textit{M}. The outgroup segment \textit{o} is the next most similar segment to \textit{s} after \textit{q}, with \textit{distance(o,s)} $>$ \textit{distance(s,q)}. (d) All segments are aligned to \textit{o}. From the ranked distances, taxa with distances smaller than \textit{distance(o,q)} are also added to \textit{M}. Thus, \textit{M} includes all the nearest evolutionary neighbors for the query segment \textit{q} (the taxa corresponding to segments \textit{a}, \textit{b}, \textit{c}, \textit{d}, \textit{o} and \textit{s}). The taxon ID assigned to \textit{q} is the lowest common ancestor of taxa in \textit{M}. (e) Partially resolved segment subtree at node R, which is implied by distances obtained in (c) and (d), where the exact position of some segments (\textit{a}, \textit{b}, \textit{c} and \textit{d} with dashed branches) is left unresolved by the RPA.}\label{fig:taxatortk_rpa}
\end{figure}

\subsubsection{Results}\label{results}

We evaluated the performance of taxonomic assignment with
\emph{taxator-tk} for different datasets: (a) 7176 16S rRNA genes, (b)
simulated short sequences of length 100, 500 and 1000 bp, (c) simulated
contigs for a synthetic microbial community and two public benchmark
datasets and (d) contigs of a microbial community from cow rumen. When
possible, we applied cross-validation and evaluated different taxonomic
distances between sample and reference taxa. In all cases, the reference
data were a diverse collection of full and partial genome sequences with
taxonomic annotation. As expected, performance for 16S marker genes was
best because it contained a clear phylogenetic signal. In practice, such
sequences are best classified using phylogenetic placement because it
makes use of reference phylogenies. The second evaluation with
nucleotide sequences resembling individual reads, which were sampled
from 1729 different species, showed that precision was high even for
short sequences, but about 10\% lower on average than for 16S data. The
recall increased with the length of the sequences. Therefore, it is
recommended to assemble reads prior to assignment with
\emph{taxator-tk}. For the validation with assembled contigs, we
compared our results to other state-of-the-art assignment methods:
\emph{CARMA}, \emph{MEGAN}, \emph{Kraken} (all similarity-based) and
PhyloPythiaS (composition-based). For the newly simulated community
consisting of 49 different species and the two benchmark datasets,
\emph{taxator-tk} misassigned substantially fewer contigs at species and
genus levels, resulting in a much better precision but a reduced recall.
\emph{PhyloPythiaS}, a classifier based on nucleotide composition
(\(k\)-mers), had the best recall in a specific usage scenario. For the
319 Mb cow rumen dataset, \emph{taxator-tk} was most consistent in
assigning 2 kb sub-sequences to taxonomic bins, which confirmed the
previous results on simulated contigs. In summary, \emph{taxator-tk}
also predicted the most realistic number of taxa in the samples compared
to the other programs. Considering the runtime, it was slower than
\emph{Kraken} and \emph{MEGAN}, due to additional computations, but
faster than \emph{CARMA} due to the efficient and parallel
implementation: we processed \(\sim\) 6 Gb per day using 10 CPU cores,
including the initial local alignment step. The segmentation procedure
of \emph{taxator-tk} accounted for a 30\% decrease of the overall
runtime and the program scaled approximately linearly with the input
data size.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_taxatortk/figure_4.pdf}
\caption[Family-level bin precision for the simulated metagenome sample with 49 species]{Family-level bin precision for the simulated metagenome sample with 49 species (simArt49e). (a--c) Each family bin’s assignment precision related to logarithmic bin size for seven cross-validation experiments with simArt49e. The results of the single experiments were added to assess the taxonomic assignment performance across a range of evolutionary distances between the query and the reference sequences, excluding the least abundant bins (1\% of total bp). We calculated the precision values for (a) CARMA3, (b) MEGAN4 and (c) taxator-tk, counting assignments to lower-ranking taxa at the family level, and added a smoothed k-nearest-neighbor estimate of the mean precision in R using wapply (width=0.3) followed by smooth.spline (df=10). CARMA3 and MEGAN4 incorrectly identified many small taxonomic bins, substantially more than taxator-tk. (d) gives the amount of correct, false and undetermined family-level assignments for the different classifiers with simArt49e.}\label{fig:taxatortk_precision}
\end{figure}

\subsubsection{Discussion}\label{discussion}

For all compared methods, the bin precision decreased with the bin size.
Throughout all validation experiments, we could show that
\emph{taxator-tk} was the most precise method in assigning metagenome
nucleotide sequences to corresponding taxa among the compared methods
(an example shown in Figure~\ref{fig:taxatortk_precision}), which also
resulted in the most realistic number of taxa. However, it assigned
fewer data overall than other methods. This trade-off is a direct
implication of the algorithm design, which is tailored towards
minimization of errors. Therefore, it can confidently assign a core of
sequences, for instance to train a model using nucleotide composition or
to estimate taxon abundances. The use of unstructured reference data
allows assigning across all domains of life, in contrast to most methods
using specific gene families. From a methodological point of view, we
presented an alternative phylogenetic inference algorithm which runs in
linear time with respect to the number of homologs, and which applies to
any nucleotide sequences with no need to select algorithm parameters.
Besides taxonomic annotation of metagenomes, it can be applied to any
DNA or RNA sequence, for instance to detect contamination in isolate
sequencing data.

\FloatBarrier

\newpage

\subsection{\texorpdfstring{A probabilistic model for genome recovery
(\emph{MGLEX})}{A probabilistic model for genome recovery (MGLEX)}}\label{sec:synopsis_mglex}

The corresponding article in Section~\ref{sec:full_mglex} describes a
probabilistic model for use in metagenome binning. Such likelihood
models are at the core of many popular algorithms, including sequence
classification and clustering. While some models exist as fixed parts of
contig clustering programs, we developed a new modular, stand-alone and
reusable model using a large set of input features. This model is based
on parameterized submodels for which maximum likelihood (ML) parameter
estimates can be inferred. Besides classification and clustering, we
demonstrate alternative applications such as sample size reduction and
visualization. The method is available as an open-source Python library
and command line program called MGLEX.

\subsubsection{Introduction}\label{introduction-1}

Shotgun sequencing of a microbial community bypasses the need to obtain
pure cultures and thus enables novel insights into ecosystems, in
particular for those genomes that are inaccessible by cultivation. Since
current metagenome assemblies are oftentimes highly fragmented, a
process called binning sorts assembled sequences (contigs) according to
the underlying genomes. Various programs were written to bin
metagenomes, using different methodologies and sequence features. These
comprise classification and clustering by consideration of \(k\)-mer
distributions (nucleotide composition), sequence similarity (homology)
and assembly read coverage (genome copy number). Coverage information
can be very powerful for separating genomes, if multiple samples with
varying genome copies are sequenced and co-assembled. However, with a
limited number of samples, it remains difficult to reconstruct
high-quality bins down to the strain level. Here, we propose a model for
metagenome binning, using probabilities to represent natural
uncertainty. The model aggregates explicit submodels for read coverage,
nucleotide composition and contig similarity to reference sequences (via
taxonomic annotation). This design incorporates knowledge about the
feature generation process in each submodel, which leads to a robust fit
when few data are available. In contrast, other methods frequently apply
a data-driven transform before clustering with a single, e.g.~Gaussian,
model. Our implementation \emph{MGLEX} does not represent an automatic
binning solution but a flexible framework for genome recovery.

\subsubsection{Methods}\label{methods-1}

A classification model is trained to distinguish data of different
classes. In probabilistic modeling, training means to determine the
model parameters (\(\theta\)) from example data for a set of different
classes. Here, classes correspond to different genomes which form part
of a metagenome and the data to be classified are contigs. Hence, we
need to provide training sequences for each genome before we can
classify unknown contigs.

Let \(1\le i\le D\) be an index referring to \(D\) contigs resulting
from a shotgun metagenomic experiment. For the \(i\)\textsuperscript{th}
contig, we define a joint likelihood for genome bin \(g\)
(Equation~\ref{eq:mglex_likelihood_aggregate}), which is a weighted
product over \(M\) independent submodel likelihoods for the different
feature types. For the \(k\)\textsuperscript{th} submodel,
\(\bm{\mathit{\Theta_k}}\) is the corresponding parameter vector,
\(\bm{F_{i,k}}\) the feature vector of the \(i\)\textsuperscript{th}
contig and \(\alpha_k\) defines the contribution of the respective
submodel or feature type. \(\beta\) is a free scaling parameter to
adjust the smoothness of the aggregate likelihood distribution over the
genome bins (bin posterior).

\begin{equation}
\mathcal{L}(\mathbf{\Theta_g} \mid \mathbf{F_i})
= \left( \prod_{k=1}^M \mathcal{L}(\bm{\mathit{\Theta_{gk}}} \mid \bm{F_{ik}})^{\alpha_k} \right)^\beta
\label{eq:mglex_likelihood_aggregate}\end{equation}

The model assumes statistical independence of the submodel features. All
model parameters are determined from training data, \(\mathbf{\Theta}\)
using submodel ML estimation, \(\bm \alpha\) using the inverse standard
deviations of the class log-likelihood distributions
(Figure~\ref{fig:mglex_alpha_inference}) and \(\beta\) by mean squared
error (MSE) minimization (Figure~\ref{fig:mglex_beta_fitting}).

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_mglex/main_alpha-inference.pdf}
\caption[Submodel weighting using $\alpha_k$]{Procedure for determination of $\alpha_k$ for each submodel. The figure shows a schematic for a single genome and two submodels. The genome’s contig log-likelihood distribution is scaled to a standard deviation of one before adding the term in the aggregate model.}\label{fig:mglex_alpha_inference}
\end{figure}

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_mglex/main_beta-fitting.pdf}
\caption[Training and test error as a function of $\beta$]{Model training (err) and test error (Err) as a function of $\beta$ for the complete aggregate model including all submodels and feature types. The solid curve shows the average and the colored shading the standard deviation of the three partitions in cross-validation. The corresponding optimal values for $\beta$ are marked by black dots and vertical lines. The minimum average training error is 0.238 ($\beta=2.85$) and test error is 0.279 at $\beta=1.65$.}\label{fig:mglex_beta_fitting}
\end{figure}

We integrate different submodels
\(\mathcal{L}(\bm{\mathit{\Theta_k}} \mid \bm{F_{i,k}})\) according to
distinct input feature types:

\begin{itemize}
\tightlist
\item
  a Poisson model for absolute read coverage considering multiple
  samples
\item
  a Binomial model for relative read coverage considering multiple
  samples
\item
  a frequency model for \(k\)-mers
\item
  a set of layered frequency models for taxonomic annotation of contigs
\end{itemize}

The layered frequency model is an adjustment of the standard frequency
model for hierarchical labels because the taxonomy represents a
tree-like structure (Figure~\ref{fig:mglex_hnbayes_tree}). The listed
submodels are kept simple and make feature independence assumptions to
simplify calculations.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_mglex/main_tree.pdf}
\caption[Simplified taxonomy]{Taxonomy structure simplified to four levels and eight nodes. A full taxonomy may consist of thousands of nodes. Each taxonomy level uses a frequency model which is assumed independent of the remaining levels.}\label{fig:mglex_hnbayes_tree}
\end{figure}

We simulated a metagenome (400 genomes with strain heterogeneity) and
created short contigs (1 kb) to validate and demonstrate the aggregate
model. Differential abundances were produced by simulating Illumina
reads (150 bp) for a primary lognormal and three secondary abundance
distributions and by mapping the resulting reads to the contigs,
introducing typical biases but omitting the actual read assembly. For
each genome, we obtained 300 kb of contig data and calculated the read
coverage, \(5\)-mer frequencies and taxonomic annotations as features
for the model.

\subsubsection{Results}\label{results-1}

Using the simulated metagenome, we applied three-fold cross-validation
and checked how well the model classified contigs to the most likely
genome (ML) with different combinations of input features. Genome
abundance turned out to be the weakest single feature type while
taxonomic annotation from local alignment to reference genome sequences
was the strongest. However, the aggregation of submodels according to
Equation~\ref{eq:mglex_likelihood_aggregate} yielded better performance
in all cases. In summary, about 68\% of contig pairs, which were not
used for model training, were classified to the same genome using the
full set of available submodels. Considering species-level bins, this
value increased to 79\%, which showed that the model had difficulty
distinguishing strains of the same species using the differential abundance
values stemming from only four samples in our simulation. The error
decreased further when applying soft (not ML) classification, fitting
the parameter \(\beta\) (Figure~\ref{fig:mglex_beta_fitting}), because
each contig could then belong to several genomes with varying class
posterior probability. When the model was used to refine the genome bins
from two popular automatic binning programs, the quality (adjusted Rand
index) improved for both of these programs.

We demonstrated alternative model applications besides classification.
Using the likelihood distributions in the training data, we calculated
\emph{p}-values, which indicate how extreme a particular contig
likelihood is with respect to the training data. With sufficient
training data (100 kb in our example), we used the \emph{p}-value to
enrich a metagenome sample \emph{in-silico} for a specific genome, so
that irrelevant contigs were removed and the overall sample size was
reduced. On average, a critical \emph{p}-value of 2.5\% led to a sample
size reduction of 95\%. Such a filtering step may be useful for a more
focused analysis or to apply a method with otherwise prohibitive
runtime. As a second application example, we derived a probabilistic
measure to quantify the similarity between any two genomes or genome
bins. The quantity is based on a relative mixture likelihood and may be
used to cluster bins hierarchically and to analyze the similarity
structure of genome bins (Figure~\ref{fig:mglex_tree_bin_comparison}).
In particular, the method indicates whether the resolution of individual
bins is justified with respect to the model and contig data.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_mglex/main_bin-similarity.pdf}
\caption[Average linkage clustering of genomes using probabilistic distances]{Average linkage clustering of a random subset of 50 out of 400 genomes using probabilistic distances to analyze bin resolution. This example compares the left (blue) tree, which was constructed only with nucleotide composition and taxonomic annotations, with the right (red) tree, which uses all available features. The tip labels were shortened to fit into the figure. The similarity axis is scaled logarithmically to focus on values close to one. Bins which are more than 50\% similar branch in the outermost ring whereas highly dissimilar bins branch close to the center.}\label{fig:mglex_tree_bin_comparison}
\end{figure}

\subsubsection{Discussion}\label{discussion-1}

We described an aggregate likelihood model with applications in
metagenome binning, for instance classification, genome enrichment and
visualization. It builds on specific submodels, each responsible for
different feature types. The modular design helps to improve the model
and to compute and interpret the results. In comparison to previous
methods, we added two new submodels. The first is a binomial model for
relative differential read coverage over multiple samples to account for
systematic read mapping biases and the second is a layered frequency
model for taxonomic annotation, which allows considering external
knowledge from reference sequences for sequence binning. We also
proposed a new weighting scheme to combine the information of several
submodels. The reference implementation called \emph{MGLEX} in its
current state lacks support for parallel computations, which will be
added later. As the runtime for all submodel ML parameter estimations
and sequence classification is linear in the number of contigs,
embedding it into clustering algorithms such as the Expectation
Maximization (EM) or Markov Chain Monte Carlo (MCMC) algorithms is also
feasible. We hope to continue developing the open-source package
\emph{MGLEX} as a flexible framework for metagenome analysis and
binning, to be integrated into programs and workflows.

\FloatBarrier

\newpage

\subsection{Further works}\label{sec:synopsis_furtherworks}

The published methods in this thesis were validated using data
simulations and sampling from reference genome sequences. Nevertheless,
their use must be shown when applied to a variety of real metagenomes.
The program \emph{taxator-tk} was subsequently applied in two metagenome
studies in completely different settings. For the publication by
Bulgarelli et al.
(\protect\hyperlink{ref-BulgarelliStructure2015}{2015}), taxonomic
profiles were generated for metagenome contigs to study complex
microbial communities associated with plant roots (rhizosphere). The
taxonomic profiles were shown to be consistent with profiles based on
independent 16S amplicon sequencing for the same communities.
Furthermore, \emph{taxator-tk} was able to discover members of clades,
for instance Archaea and Cyanobacteria, which the 16S primers seemed to
have missed in the amplification step. Such biases for the primers used
to amplify regions of the 16S gene were also independently confirmed
(Eloe-Fadrosh et al.,
\protect\hyperlink{ref-EloefadroshMetagenomics2016}{2016}). The
taxonomic profiles based on shotgun metagenome data were also not
influenced by 16S copy number variations in the corresponding genomes,
unlike the amplicon profiles. In a second study of a benzene-degrading
enrichment community (Dong et al.,
\protect\hyperlink{ref-DongReconstructing2017}{2017}), \emph{taxator-tk}
was applied to derive bin-specific sequence data to train a full model
for the composition-based classifier PhyloPythiaS (Patil et al.,
\protect\hyperlink{ref-PatilTaxonomic2011}{2011}), so that the genomes
of four species could be recovered, two of them with over 97\%
completeness. Thereby, we used the same logic to define the model and to
seed the genome bins with training data as in the program PhyloPythiaS+
(Gregor et al., \protect\hyperlink{ref-GregorPhylopythias2016}{2016}),
but we replaced the homology search based on marker genes with
\emph{taxator-tk}, which offered a better coverage of genomic reference
for this task. The completeness and potential contamination levels of
the derived genomes were checked independently, based on single-copy
marker genes, and the near-complete genomes were then used to study
benzene degradation pathways by linkage to metabolomic experiments and
to propose a benzene oxidation pathway with direct sulfate reduction.

Working with metagenomic data and comparing the results of different
binning programs, for instance in (Dröge, Gregor \& McHardy,
\protect\hyperlink{ref-DrogeTaxatortk2014}{2014}), we observed that the
current metagenome analysis toolbox features many programs for similar
problems giving different results. One possible explanation is that
metagenomics is an interdisciplinary field with contributions from
biotechnology, ecology and medicine, each with a different focus on
ecosystems and data (see Figure~\ref{fig:metagenomes_environments}). As
a result, metagenomics lacks a systematic and cross-discipline view on
software for data processing and analysis. To improve the situation, the
\href{http://cami-challenge.org/}{Critical Assessment of Metagenomic
Interpretation (CAMI) challenge (http://cami-challenge.org)} compared
computer programs for metagenome analysis, such as metagenome assembly,
taxonomic profiling and genome binning. As part of my thesis work, I
contributed both by taking part in the conception and implementation of
the binning evaluation framework as well as by submitting
\emph{taxator-tk} for comparison (Sczyrba et al.,
\protect\hyperlink{ref-SczyrbaCritical2017}{2017}).

\newpage

\section{Summary of results}\label{summary-of-results}

The taxonomic annotation program \emph{taxator-tk} was shown to obtain
very high precision on a number of synthetic and real metagenomes by
applying phylogenetic principles. It requires similar reference genome
sequences to calculate a phylogenetic neighborhood for annotation. In
its initial stage, the provided example workflow has the option to use
two different search programs, but the local aligner is exchangeable in
order to adapt to sequence data which stem from different experimental
procedures. Within the core algorithm (RPA), which is based on pairwise
alignment of partial sequences (segments), \emph{taxator-tk} neither
relies on exact scores from the local aligner nor on a complete set of
retrieved homologs and there are no related parameters to be set. The
RPA was recently adapted to amino acid sequences, so that direct protein
alignment can be used for the similarity search without the need to
back-translate similarity matches to the nucleotide level. For example,
some alternative local alignment programs for identification of similar
sequences have been presented lately, which claim to improve the search
time by fast protein alignment with a reduced alphabet (Zhao, Tang \&
Ye, \protect\hyperlink{ref-ZhaoRapsearch22011}{2011}; Huson \& Xie,
\protect\hyperlink{ref-HusonPoor2014}{2014}; Hauswedell, Singer \&
Reinert, \protect\hyperlink{ref-HauswedellLambda2014}{2014}; Buchfink,
Xie \& Huson, \protect\hyperlink{ref-BuchfinkFast2014}{2014}). Another
advantage of \emph{taxator-tk} is its independence of curated reference
data, in contrast to the standard procedures in phylogenetic analysis
using precomputed HMMs or gene families. This comes at the cost of an
increased computation time for de-novo phylogenetic structure detection
but enables \emph{taxator-tk} to be applied in less frequent,
non-standard situations, for example to analyze communities with
eukaryotic content, like algae or fungi.

The probabilistic model for metagenome binning and its software
implementation \emph{MGLEX} make use of many available sequence features
to classify contigs to genomes or genome bins, and we exemplified
alternative applications such as genome enrichment and bin analysis. We
could also show on benchmark data that the application of the model
improved on the results from recent automatic binning procedures, which
confirmed our initial incentive to make better use of the available data
to recover individual genomes. The model itself is very generic so that
it can, in theory, also be applied to non-metagenomic datasets. We
designed \emph{MGLEX} as a subroutine for use in other software to
maximize the benefits resulting from future improvements. It should be
integrated into more user-friendly applications for genome recovery.

In the conception stage of both methods, we took care that the
algorithms scale to large datasets and that they solve well-defined
problems. As a commitment to open science, we released the program
source codes to the public and used simple and well specified data
formats wherever possible. The software ought to be flexible enough to
keep pace with the future progress both in experimental protocols and
sequencing technologies.

The two methods in this thesis extend available software for analyzing
metagenomes. From a methodological perspective, these methods cover
several algorithmic fields including sequence alignment, phylogenetics
and probabilistic modeling. Each of the articles published in the course
of the thesis pursues the goal of improving the understanding of
metagenomic data. While the binning review (Dröge \& McHardy,
\protect\hyperlink{ref-DrogeTaxonomic2012}{2012}), see Appendix C, gave
an extensive introduction to the different metagenome binning and
analysis approaches, the first method article in
Section~\ref{sec:full_taxator-tk} (Dröge, Gregor \& McHardy,
\protect\hyperlink{ref-DrogeTaxatortk2014}{2014}) presented the program
\emph{taxator-tk}, which enables precise taxonomic annotation of entire
metagenomes by fast calculation of phylogenetic neighborhoods. The
second method article in Section~\ref{sec:full_mglex} (Dröge, Schönhuth
\& McHardy, \protect\hyperlink{ref-DrogeProbabilistic2017}{2017})
proposed a statistical classification framework to recover genomes from
shotgun-sequenced metagenomes. Applied studies used \emph{taxator-tk}
and demonstrated its utility to inform about taxonomic composition
(Bulgarelli et al.,
\protect\hyperlink{ref-BulgarelliStructure2015}{2015}) and to
reconstruct near-complete genomes for a simple community (Dong et al.,
\protect\hyperlink{ref-DongReconstructing2017}{2017}). Finally, a
comprehensive comparison of metagenome processing software was conducted
as a challenge (Sczyrba et al.,
\protect\hyperlink{ref-SczyrbaCritical2017}{2017}) to improve on the
overall interpretation of metagenome studies.

\newpage

\section{Conclusions and outlook}\label{conclusions-and-outlook}

Metagenomics as a discipline has matured in the course of this thesis,
with regard to nucleotide sequencing, metagenome assembly and
computational analysis. For instance, paired read insert libraries and
long-read technologies allow assembling larger fractions of metagenomes.
The subsequent assembly of metagenomes, which differs from isolate
genome assembly, is considered an important task and led to the
development of dedicated algorithms. The interest in medical
applications has been continuously increasing in metagenomics so that
analyzing the human gut microbiome and its impact on human health has
become a common procedure. Several large-scale projects reflect an
increased interest in different areas, for instance the
\href{http://hmpdacc.org/}{Human Microbiome Project
(http://hmpdacc.org)}, \href{http://www.metahit.eu/}{MetaHIT
(http://www.metahit.eu)} or the
\href{http://www.earthmicrobiome.org/}{Earth Microbiome Project
(http://www.earthmicrobiome.org)}, which all seek to collect data using
standardized protocols and analysis methods.

For the analysis of metagenomics data, the impact of algorithms on the
overall conclusions must not be underestimated, as most of the data is
directly or indirectly produced by computer programs. Each specific
procedure may be sensitive to the applied software pipeline and the
results may, for example, differ in the number and abundance of OTUs,
the quality of assembled genome sequences and the robustness to
particular experimental details such as sequencing errors. For the
multitude of methods which have been developed over the past years,
including the methods presented in this thesis, it remains to be determined
under which conditions they should, or should not, be applied and how
they compare to other methods which claim to solve similar problems.
Therefore, in addition to developing new methods, rigorous testing is
required to provide a more complete picture of the metagenomic software
landscape for the scientific community. In the course of the CAMI
initiative, we noted that software accessibility represents an important
factor, among others like code quality and program (re-)usability, in
order to enable systematic testing and reproducibility of results.
Future compliance of academic software with these criteria will
therefore be an important factor for a better assessment of programs and
their results.

This thesis and the methods presented here contribute to the field by
providing some base-level metagenome analysis tools. They implement new
theoretical approaches and are accessible for evaluation and application
as open-source. Both \emph{taxator-tk} and \emph{MGLEX} are also
suitable to assess the quality of metagenome assemblies and binning from
various environments. In the near future, the aim will be to recover
high-quality genomes in an automatic way. This target may soon be
reached, not only by algorithmic improvements in metagenomics but also
by combination with new experimental techniques and further progress in
sequencing technology. For instance, single-molecule sequencing can
eliminate problems in metagenome processing, which are associated with
the short read length. Single cell sequencing is another complementary
technique which allows assembling genomes from very limited numbers of
microbial cells (Lasken \& McLean,
\protect\hyperlink{ref-LaskenRecent2014}{2014}; Gawad, Koh \& Quake,
\protect\hyperlink{ref-GawadSinglecell2016}{2016}), which need to be
isolated but not grown in medium. The combination of data from
single-cell and metagenome sequencing can improve genome reconstructions
(Mende et al., \protect\hyperlink{ref-MendeImproved2016}{2016}; Bremges
et al., \protect\hyperlink{ref-BremgesMecors2016}{2016}).

\chapter{Report of Publications}\label{report-of-publications}

This chapter lists the publications to which I contributed in the course
of this thesis. My share in the individual works is reported as
percentage estimates (5\% ranges), together with a short description of
my contributions.

\section{Central publications}\label{central-publications}

These are the publications of the developed methods on which this
cumulative thesis is based.

\begin{longtable}[]{@{}ll@{}}
\toprule
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Title}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
\emph{Taxator-tk}: Precise Taxonomic Assignment of Metagenomes by Fast
Approximation of Evolutionary Neighborhoods\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Journal}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Bioinformatics\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Published}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
10 November 2014\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Authors}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Johannes Dröge, Ivan Gregor, Alice C. McHardy\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{DOI}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
\href{https://doi.org/10.1093/bioinformatics/btu745}{10.1093/bioinformatics/btu745}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Contributions}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Designed method, developed software, designed experiments, conducted
experiments, wrote manuscript\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Attribution}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
71\% to 75\%\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\begin{longtable}[]{@{}ll@{}}
\toprule
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Title}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
A Probabilistic Model to Recover Genomes in Shotgun Metagenomics\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Journal}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
PeerJ\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Published}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
17 December 2016 (preprint), 22 May 2017\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Authors}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Johannes Dröge, Alexander Schönhuth, Alice C. McHardy\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{DOI}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
\href{https://doi.org/10.7717/peerj-cs.117}{10.7717/peerj-cs.117}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Contributions}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Designed method, developed software, designed experiments, conducted
experiments, wrote manuscript\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Attribution}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
86\% to 90\%\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\section{Related publications}\label{related-publications}

These publications either relate to metagenome binning or describe
studies in which the developed software was applied to analyze metagenomes.

\begin{longtable}[]{@{}ll@{}}
\toprule
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Title}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Taxonomic binning of metagenome samples generated by next-generation
sequencing technologies\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Journal}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Briefings in Bioinformatics\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Published}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
31 July 2012\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Authors}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Johannes Dröge, Alice C. McHardy\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{DOI}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
\href{https://doi.org/10.1093/bib/bbs031}{10.1093/bib/bbs031}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Contributions}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Collected information, wrote manuscript\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Attribution}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
50\%\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\begin{longtable}[]{@{}ll@{}}
\toprule
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Title}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Structure and Function of the Bacterial Root Microbiota in Wild and
Domesticated Barley\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Journal}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Cell Host \& Microbe\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Published}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
11 March 2015\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Authors}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Davide Bulgarelli, Ruben Garrido-Oter, Philipp C. Münch, Aaron Weimann,
Johannes Dröge, Yao Pan, Alice C. McHardy, Paul Schulze-Lefert\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{DOI}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
\href{https://doi.org/10.1016/j.chom.2015.01.011}{10.1016/j.chom.2015.01.011}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Contributions}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Assembled metagenome, analyzed metagenome, added to manuscript, reviewed
manuscript\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Attribution}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
1\% to 5\%\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\begin{longtable}[]{@{}ll@{}}
\toprule
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Title}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
PhyloPythiaS+: a self-training method for the rapid reconstruction of
low-ranking taxonomic bins from metagenomes\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Journal}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
PeerJ\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Published}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
8 February 2016\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Authors}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Ivan Gregor, Johannes Dröge, Melanie Schirmer, Christopher Quince, Alice
C. McHardy\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{DOI}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
\href{https://doi.org/10.7717/peerj.1603}{10.7717/peerj.1603}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Contributions}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Contributed to method design, reviewed manuscript\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Attribution}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
1\% to 5\%\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\begin{longtable}[]{@{}ll@{}}
\toprule
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Title}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Reconstructing metabolic pathways of a member of the genus \emph{Pelotomaculum}
suggesting its potential to oxidize benzene to carbon dioxide with
direct reduction of sulfate\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Journal}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
FEMS Microbiology Ecology\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Published}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
23 December 2016\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Authors}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Xiyang Dong, Johannes Dröge, Christine von Toerne, Sviatlana Marozava,
Alice C. McHardy, Rainer U. Meckenstock\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{DOI}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
\href{https://doi.org/10.1093/femsec/fiw254}{10.1093/femsec/fiw254}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Contributions}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Annotated metagenome, created phylogenetic trees of genes, added to
manuscript, reviewed manuscript\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Attribution}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
16\% to 20\%\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\begin{longtable}[]{@{}ll@{}}
\toprule
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Title}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Critical Assessment of Metagenome Interpretation -- a benchmark of
computational metagenomics software\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Journal}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Nature Methods\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Published}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
12 June 2017 (preprint), 2 October 2017\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Authors}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Alexander Sczyrba, Peter Hofmann, Peter Belmann, David Koslicki, Stefan
Janssen, Johannes Dröge, Ivan Gregor, Stephan Majda, Jessika Fiedler,
Eik Dahms, Andreas Bremges, Adrian Fritz, Ruben Garrido-Oter, Tue
Sparholt Jørgensen, Nicole Shapiro, Philip D Blood, Alexey Gurevich,
Yang Bai, Dmitrij Turaev, Matthew Z DeMaere, Rayan Chikhi, Niranjan
Nagarajan, Christopher Quince, Fernando Meyer, Monika Balvočiūtė, Lars
Hestbjerg Hansen, Søren J Sørensen, Burton K H Chia, Bertrand Denis,
Jeff L Froula, Zhong Wang, Robert Egan, Dongwan Don Kang, Jeffrey J
Cook, Charles Deltel, Michael Beckstette, Claire Lemaitre, Pierre
Peterlongo, Guillaume Rizk, Dominique Lavenier, Yu-Wei Wu, Steven W
Singer, Chirag Jain, Marc Strous, Heiner Klingenberg, Peter Meinicke,
Michael D Barton, Thomas Lingner, Hsin-Hung Lin, Yu-Chieh Liao,
Genivaldo Gueiros Z Silva, Daniel A Cuevas, Robert A Edwards, Surya
Saha, Vitor C Piro, Bernhard Y Renard, Mihai Pop, Hans-Peter Klenk,
Markus Göker, Nikos C Kyrpides, Tanja Woyke, Julia A Vorholt, Paul
Schulze-Lefert, Edward M Rubin, Aaron E Darling, Thomas Rattei, Alice C
McHardy\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{DOI}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
\href{https://doi.org/10.1038/nmeth.4458}{10.1038/nmeth.4458}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Contributions}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Contributed to data simulation design, curated reference data,
contributed to evaluation methods, contributed to framework design,
added to manuscript, reviewed manuscript\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Attribution}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
1\% to 5\%\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\section{Other publications}\label{other-publications}

\begin{longtable}[]{@{}ll@{}}
\toprule
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Title}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Bioboxes: standardised containers for interchangeable bioinformatics
software\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Journal}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
GigaScience\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Published}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
15 October 2015\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Authors}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Peter Belmann, Johannes Dröge, Andreas Bremges, Alice C. McHardy,
Alexander Sczyrba, Michael D. Barton\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{DOI}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
\href{https://doi.org/10.1186/s13742-015-0087-0}{10.1186/s13742-015-0087-0}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Contributions}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
Contributed to design, created template containers, reviewed
manuscript\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.20\columnwidth}\raggedright\strut
\textbf{Attribution}\strut
\end{minipage} & \begin{minipage}[t]{0.75\columnwidth}\raggedright\strut
6\% to 10\%\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\chapter{\texorpdfstring{\emph{Taxator-tk}: Precise Taxonomic Assignment
of Metagenomes by Fast Approximation of Evolutionary
Neighborhoods}{Taxator-tk: Precise Taxonomic Assignment of Metagenomes by Fast Approximation of Evolutionary Neighborhoods}}\label{sec:full_taxator-tk}

\textbf{J. Dröge}\textsuperscript{1,2}, \textbf{I.
Gregor}\textsuperscript{1,2} and \textbf{A. C.
McHardy}\textsuperscript{1,3*}

\textsuperscript{1}Department for Algorithmic Bioinformatics, Heinrich
Heine University, Universitätsstraße 1, 40225 Düsseldorf, Germany

\textsuperscript{2}Max-Planck Research Group for Computational Genomics
and Epidemiology, Max-Planck Institute for Informatics, University
Campus E1 4, 66123 Saarbrücken, Germany

\textsuperscript{3}Computational Biology of Infection Research,
Helmholtz Center for Infection Research, Inhoffenstraße 7, 38124
Braunschweig, Germany

\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}

\textbf{This is an author-produced version of an article accepted for
publication in
\emph{\href{https://bioinformatics.oxfordjournals.org}{Bioinformatics}}
following peer review. This version has been adapted to the thesis
layout. The original open-access article is accessible by DOI
\href{https://doi.org/10.1093/bioinformatics/btu745}{10.1093/bioinformatics/btu745}.}

\section{Abstract}\label{abstract}

\subsection{Motivation}\label{motivation}

Metagenomics characterizes microbial communities by random shotgun
sequencing of DNA isolated directly from an environment of interest. An
essential step in computational metagenome analysis is taxonomic
sequence assignment, which allows identifying the sequenced community
members and reconstructing taxonomic bins with sequence data for the
individual taxa. For the massive datasets generated by next-generation
sequencing technologies, this cannot be performed with de-novo
phylogenetic inference methods. We describe an algorithm and the
accompanying software, taxator-tk, which performs taxonomic sequence
assignment by fast approximate determination of evolutionary neighbors
from sequence similarities.

\subsection{Results}\label{results-2}

Taxator-tk was precise in its taxonomic assignment across all ranks and
taxa for a range of evolutionary distances and for short as well as for
long sequences. In addition to the taxonomic binning of metagenomes, it
is well suited for profiling microbial communities from metagenome
samples because it identifies bacterial, archaeal and eukaryotic
community members without being affected by varying primer binding
strengths, as in marker gene amplification, or copy number variations of
marker genes across different taxa. Taxator-tk has an efficient,
parallelized implementation that allows the assignment of 6 Gbp of
sequence data per day on a standard multiprocessor system with ten CPU
cores and microbial RefSeq as the genomic reference data.

\subsection{Availability}\label{availability}

Taxator-tk source and binary program files are publicly available at
\url{http://algbio.cs.uni-duesseldorf.de/software/}.

\section{Introduction}\label{introduction-2}

Metagenomics allows us to study microbial communities from natural
environments without the need to obtain pure cultures of the individual
member species (Hugenholtz,
\protect\hyperlink{ref-HugenholtzExploring2002}{2002}; Riesenfeld,
Schloss \& Handelsman,
\protect\hyperlink{ref-RiesenfeldMetagenomics2004}{2004}). The shotgun
sequencing of microbial community DNA with current techniques generates
reads that range from less than 100 to several thousand nucleotides
(Dröge \& McHardy, \protect\hyperlink{ref-DrogeTaxonomic2012}{2012};
Klumpp, Fouts \& Sozhamannan,
\protect\hyperlink{ref-KlumppNext2012}{2012}). By computational analyses
of metagenome sequence samples, we can estimate the abundances of
different taxa for the sampled communities, known as taxonomic
profiling, characterize their functional and metabolic potential based
on the predicted proteins and resolve the contributions of individual
taxa to the latter by reconstructing \enquote{bins} of unassembled or
assembled sequences that originate from the same taxon.

A taxonomic profile of a microbial community can be inferred by either
targeted amplification and sequencing of taxonomic marker genes or from
metagenome shotgun datasets (Lindner \& Renard,
\protect\hyperlink{ref-LindnerMetagenomic2013}{2013}; Sunagawa et al.,
\protect\hyperlink{ref-SunagawaMetagenomic2013}{2013}; Silva et al.,
\protect\hyperlink{ref-SilvaFocus2014}{2014}). Most metagenome profiling
methods classify reads based on predefined taxon-specific (Segata et
al., \protect\hyperlink{ref-SegataMetagenomic2012}{2012}) or
\enquote{universal} marker genes (Darling et al.,
\protect\hyperlink{ref-DarlingPhylosift2014}{2014}), or directly
estimate a taxonomic profile for the underlying microbial community from
their k-mer composition (Koslicki, Foucart \& Rosen,
\protect\hyperlink{ref-KoslickiQuikr2013}{2013}). Frequently used
phylogenetic placement programs within such frameworks are pplacer
(Matsen, Kodner \& Armbrust,
\protect\hyperlink{ref-MatsenPplacer2010}{2010}) or EPA/RAxML (Berger,
Krompass \& Stamatakis,
\protect\hyperlink{ref-BergerPerformance2011}{2011}), which both operate
in a probabilistic framework to place a query gene sequence in a
pre-computed reference phylogeny of a particular gene family. If this
gene tree is an approximate representation of the respective species
tree -- or reference taxonomy -- this can be used to assign a taxon
identifier (ID) to the query sequence (Matsen, Kodner \& Armbrust,
\protect\hyperlink{ref-MatsenPplacer2010}{2010}; Stark et al.,
\protect\hyperlink{ref-StarkMltreemap2010}{2010}). Taxon abundances are
then derived from the individual read counts or gene frequencies within
each taxonomic group.

Binning methods place the sequences of a shotgun metagenome sample into
bins representing the different taxa of the sampled microbial community.
If a bin represents a low-ranking taxon, such as species, then the set
of reads or contigs of an individual taxonomic bin serves as a
draft-genome reconstruction for a community member (Pope et al.,
\protect\hyperlink{ref-PopeIsolation2011}{2011}). Binning methods are
either based on clustering or classification. Clustering methods group
sequences into bins without consideration of external reference
sequences or taxonomic information. Instead, bins are inferred based on
similarities in GC content, oligomer frequencies, the abundance of genes
or contig coverage within one or multiple samples (Baran \& Halperin,
\protect\hyperlink{ref-BaranJoint2012}{2012}; Carr, Shen-Orr \&
Borenstein, \protect\hyperlink{ref-CarrReconstructing2013}{2013};
Albertsen et al., \protect\hyperlink{ref-AlbertsenGenome2013}{2013};
Alneberg et al., \protect\hyperlink{ref-AlnebergBinning2014}{2014}), or
by using a combination of these (Iverson et al.,
\protect\hyperlink{ref-IversonUntangling2012}{2012}). This allows draft
genome recovery from deep lineages for sequences of sufficient length.
Taxonomic binning, like profiling, uses the resemblance of a sequence to
known taxa in either global sequence composition or local sequence
similarity to assign a taxon ID. For the human gut microbiome, extensive
genome sequencing of isolate cultures allowed species-level taxonomic
binning for a substantial portion (approx.\ 40\%) of a metagenome sample
(Schloissnig et al.,
\protect\hyperlink{ref-SchloissnigGenomic2013}{2013}) by mapping the
reads to isolate genome sequences, which exist for many abundant species
(Sunagawa et al.,
\protect\hyperlink{ref-SunagawaMetagenomic2013}{2013}). However, this
procedure is not suitable for environments in which most species are
from deep-branching lineages without available reference genome
sequences. Taxonomic binning of these requires more sophisticated
similarity-based or composition-based taxonomic assignment methods
(McHardy et al., \protect\hyperlink{ref-MchardyAccurate2007}{2007};
Brady \& Salzberg, \protect\hyperlink{ref-BradyPhymmbl2011}{2011}; Huson
et al., \protect\hyperlink{ref-HusonIntegrative2011}{2011}). Taxonomic
binning by sequence composition also allows draft genome recovery from
deep-branching lineages, based on limited amounts of sequences for the
individual taxa (McHardy et al.,
\protect\hyperlink{ref-MchardyAccurate2007}{2007}). Composition-based
programs achieve linear classification times regarding metagenome sample
size, while similarity-based binning methods require considerably more
computational resources for sequence similarity searches in large
reference sequence collections. Programs with a focus on processing
large amounts of raw sequencing reads, such as Kraken (Wood \& Salzberg,
\protect\hyperlink{ref-WoodKraken2014}{2014}), implement the fastest
search strategies. Similarity-based programs are more accurate for the
assignment of sequences shorter than 1 kbp (Patil et al.,
\protect\hyperlink{ref-PatilTaxonomic2011}{2011}).

A common aim in taxonomic profiling and taxonomic binning is the
identification of known taxa from a sample. A taxonomic profiler
estimates a taxonomic abundance profile for the entire sample, which can
be inferred by analyzing a smaller number of marker genes, though one
needs to account for variations in gene copy numbers for taxon-specific
markers (Lindner \& Renard,
\protect\hyperlink{ref-LindnerMetagenomic2013}{2013}). Taxonomic binning
assigns taxon IDs to a large portion of the sample sequences for
subsequent functional and metabolic analyses of individual taxon bins.
In addition, one can generate a taxonomic profile by quantifying the
assigned reads, based on read counts or coverage for each individual
bin.

From a methodological standpoint, the difference between the
phylogenetic-placement-based methods for profiling and
alignment-score-based methods for taxonomic binning and profiling, such
as MEGAN (Huson et al.,
\protect\hyperlink{ref-HusonIntegrative2011}{2011}), CARMA3 (Gerlach \&
Stoye, \protect\hyperlink{ref-GerlachTaxonomic2011}{2011}) or SOrT-ITEMS
(Monzoorul Haque et al.,
\protect\hyperlink{ref-MonzoorulhaqueSortitems2009}{2009}), is that the
latter lack a well-motivated evolutionary framework. However, they have
the advantages of being computationally lightweight and applicable to
arbitrary genes, which is a necessity for taxonomic binning.
Phylogenetic-placement-based methods cannot currently be used for
binning, because the de-novo inference of trees for gene families on a
metagenome-wide scale is computationally too demanding, particularly for
next-generation sequencing (NGS) data.

Our taxator toolkit (taxator-tk) is a software package for the taxonomic
sequence assignment in shotgun metagenomics with applications to both
profiling and binning. Conceptually, it lies between sequence similarity
based programs which use local sequence alignment scores and those using
trees. Taxator-tk extends the alignment score based approach by
approximating phylogenetic gene trees and thereby provides more accurate
taxonomic assignments, without assuming universal, rank-specific or
clade-specific gene conservation levels as parameters. We improve in
terms of applicability to large data sets compared to phylogenetic
methods by assigning genomic sequences without the computationally
demanding steps of de-novo multiple sequence alignment (MSA) and tree
inference. Taxator-tk determines a subset of homologs, which represent
the approximate evolutionary neighbors for a query sequence, by a linear
number of pairwise sequence comparisons with regard to the number of
considered homologs and then assigns a taxon ID using a reference
taxonomy based on the taxonomic IDs of these neighbors. We have
furthermore reduced the run-time by limiting the analysis to distinct
homology-supported regions of the query sequence, which we termed query
segmentation. Our open-source (GPLv3) software can be applied to
arbitrary nucleotide sequences, such as reads, contigs, scaffolds and
complete genome sequences. It can be downloaded from
\url{http://algbio.cs.uni-duesseldorf.de/software/}.

\section{Methods}\label{methods-2}

\subsection{Taxator-tk's workflow for taxonomic
assignment}\label{taxator-tks-workflow-for-taxonomic-assignment}

The workflow for the taxonomic assignment of a nucleotide query sequence
comprises three stages (Figure~\ref{fig:publication_taxator-tk_workflow}
a--c). The first stage uses a local sequence aligner to identify similar
regions from a reference sequence collection, such as microbial RefSeq
(mRefSeq) (Sayers et al.,
\protect\hyperlink{ref-SayersDatabase2009}{2009}). The implemented
workflows currently use BLAST+ (Camacho et al.,
\protect\hyperlink{ref-CamachoBlast2009}{2009}) version 2.2.28+ using
any of the blastn, megablast or tblastx algorithms and nucleotide LAST
(Frith, Hamada \& Horton,
\protect\hyperlink{ref-FrithParameters2010}{2010}) version 320. Other
aligners can be used via conversion to a TAB-separated format, if found
to be more appropriate. We discuss our choice of the aligner in the
Supplementary Methods (\enquote{IX. Sequence Homology Search via Local
Alignment}). At the beginning of the taxator algorithm in stage two,
overlapping regions on the query, each defined by local alignment to a
nucleotide reference sequence, are merged into larger subsequences
called segments (Supplementary Fig. 1). These query segments are flanked
by regions without similarity to any reference data (Supplementary Fig.
2), which are not considered further. This step reduces the overall number
of positions in the following alignment computations and improves the
taxonomic assignment of queries that have undergone genome
rearrangements, resulting in a different order of these segments. The
reference sequence regions corresponding to the local alignments are
extended at both sides by the missing number of nucleotides to match to
the corresponding query segment with respect to its length and we refer
to these as reference segments. Each independent set of homologous
segments is the input to the core algorithm in the program taxator in
stage two (Figure~\ref{fig:publication_taxator-tk_workflow} b), which
calculates independent taxon IDs for every corresponding query segment.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_taxatortk/figure_1.pdf}
\caption[Workflow diagram for taxator-tk]{Workflow diagram for the taxonomic assignment of a nucleotide query sequence with taxator-tk. Taxonomic assignment with taxator-tk includes three steps. (a) Homology search for query sequence in reference collection using a nucleotide local alignment program. (b) Program taxator splits the query into distinct segments and determines a taxon ID for each using the corresponding homologs. (c) Program binner determines a taxon ID for the entire query based on the taxonomic assignments of the individual segments.}\label{fig:publication_taxator-tk_workflow}
\end{figure}

In the third stage (Figure~\ref{fig:publication_taxator-tk_workflow} c),
multiple segments belonging to the same query are considered and their
IDs are combined in the program binner, to derive a consensus taxon ID.
The corresponding algorithm weights the individual segment assignments
by the number of identical bases to the closest reference sequence and
assigns to the entire query the taxon ID supported by the majority
(default = 70\% identical bases) of weighted assignments with a minimum
number of identical bases (default = 50 bp) (Supplementary Methods,
\enquote{II. Consensus Binning Algorithm}). Binner has the optional
parameters minimum sequence identity and minimum sample abundance, but
these were not applied in our analysis. If the taxonomic information is
limited or contradictory, taxator and binner assign identifiers to
higher ranking taxa in a conservative fashion to obtain the most
reliable taxonomic assignments.

\subsection{The taxonomic assignment algorithm
(taxator)}\label{the-taxonomic-assignment-algorithm-taxator}

The input to the algorithm is a segment \emph{q} of the original query
sequence from an (unknown) taxon Q and a set of homologous segments with
known taxon IDs. The term \enquote{segment} refers to a gap-less
subsequence of either the query or a reference sequence. Given that for
the set of homologs we know the correct underlying species tree of taxa
(Figure~\ref{fig:publication_taxator-tk_rpa} a), we can see that for our
query taxon Q, the closest evolutionary neighbors would be A, B and S.
If we simply assign X, the parental taxon of A, B and S, as a taxon
identifier, this would be inaccurate, as A, B and S are more closely
related to each other than to Q. Instead, the correct taxonomic
assignment would be a parent of X and Q, and of at least one additional
outgroup taxon (O) in the reference tree, such that Q also becomes a
descendant of the identified parent (R in
Figure~\ref{fig:publication_taxator-tk_rpa} a). If we therefore identify
the taxa A, B, S and O in the reference tree, we can determine the taxon
ID of R as the lowest common ancestor (LCA) of these taxa and assign it
to Q (and \emph{q}).

Assuming that the underlying segment tree for a set of homologs is
similar to the species tree, a natural procedure to identify the
segments corresponding to the leaf taxa within R among the homologs
would be to construct a MSA for the segment and a phylogenetic tree with
a corresponding subtree as in
Figure~\ref{fig:publication_taxator-tk_rpa} a. However, the
computational effort for this approach is superlinear with respect to
the number of homologs being compared and substantial for all the query
segments in a large sample, even using fast techniques for MSA
construction and tree inference. The taxator algorithm attempts to
identify these segments with a linear number of pairwise segment
comparisons. Let us consider an undirected graph in which nodes
represent the segments (tree leaves) and edge lengths the evolutionary
distances between pairs of segments within the underlying tree
(Figure~\ref{fig:publication_taxator-tk_rpa} b). In this graph, a
monophyletic group in the species tree is a subgraph. For all pairs of
subgraph nodes, the following inequality is true, given that the
segments have evolved with a constant rate of evolution (i.e.~the
segment tree is ultrametric): The distance between any two subgraph
nodes is smaller than that to any other node outside the subgraph. The
relationship becomes clearer when thinking of the evolutionary distance
between two nodes as the divergence time from their most recent
ancestor. Members of a monophyletic group derive from a single common
ancestor and thus there is a maximum distance for all possible pairs. If
one member's distance to an outside node is smaller than this maximum,
both must share a more recent common ancestor and the corresponding
group is not monophyletic by definition. The stated inequality can be
used to augment an incomplete group or corresponding subgraph
iteratively by taking an internal distance, ideally close to the
maximum, as a threshold and adding outside nodes to the group which have
a smaller distance to some internal node.

In this manner, taxator-tk searches for the leaf node taxa of clade R
among all segments based on a linear number of sequence comparisons
between the input segments and adds them to an empty working set M:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{-1}
\tightlist
\item
  A ranking by alignment scores from the input local alignments is used
  at the beginning to identify the reference segment \emph{s} that is
  most similar to the query \emph{q}. The working set \emph{M} is then
  augmented in two passes:
\item
  In the first pass, all segments are aligned to \emph{s} using fast
  nucleotide alignment and the edit distance. These scores in the
  following serve to approximate the evolutionary distances in the
  underlying segment phylogeny. All segment taxa with a distance less
  than or equal to the threshold \emph{distance(s,q)} are added to
  \emph{M} (Figure~\ref{fig:publication_taxator-tk_rpa} c).
\item
  The outgroup segment \emph{o} is determined as the first segment for
  which \emph{distance(s,o)} is larger than \emph{distance(s,q)}. In the
  second pass, all segments are then aligned to \emph{o} and segment
  taxa with distances smaller than or equal to \emph{distance(o,q)} are
  added to \emph{M} as well (Figure~\ref{fig:publication_taxator-tk_rpa} d).
\end{enumerate}

This procedure requires approximately \(2n\) alignments, where \(n\) is
the number of reference segments.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  The resulting set \emph{M} of taxa (implicit in the partially resolved
  tree in Figure~\ref{fig:publication_taxator-tk_rpa} e) is used to
  determine the taxon ID for \emph{q}, corresponding to the LCA of these
  taxa in a reference taxonomy, such as the NCBI taxonomy.
\end{enumerate}

If no outgroup could be determined or if \emph{M} is so diverse that the
LCA corresponds to the taxonomy root, \emph{q} is left unassigned. The
algorithm requires at least two homologous segments (\emph{s} and
\emph{o}) to determine a meaningful taxon ID. The taxa in \emph{M}
become more diverse if the alignment scores are inaccurate ultrametric
distance estimates, if the species subtree's topology deviates from the
respective part of the taxonomy or if the gene tree's topology deviates
from the species tree, for instance due to varying rates of evolution or
the inclusion of non-homologous segments in the analysis. The robustness
of the algorithm in avoiding incorrect assignments under these
circumstances relies on the number of taxa in \emph{M} and the
subsequent LCA operation. Further details relating to the robustness of
the implementation are given in the Supplementary Methods, \enquote{I.
Taxonomic Assignment of Sequence Segments}.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_taxatortk/figure_2.pdf}
\caption[Realignment placement algorithm steps]{Algorithm for taxonomic labeling of query segments (realignment placement algorithm/RPA). The RPA assigns a taxon ID to a query segment \textit{q}. (a) Species reference tree with query taxon Q and reference taxa A, B, C, D, O and S. This will be approximated by the segment phylogenetic tree for the query segment and homologous segments of reference taxa. (b) Approximate graph representing pairwise distances between the taxa. The subgraph for clade X is highlighted. (c,d) Show the two alignment passes which add segment taxa to an (empty) set \textit{M}. Segment \textit{s} is the segment with the smallest local alignment score (distance) to \textit{q} in the initial similarity search. (c) First, all segments are aligned to segment \textit{s}. The resulting distances are ordered and the taxa with equal or smaller distances than \textit{distance(s,q)} are added to \textit{M}. The outgroup segment, here \textit{o}, is the next most similar segment to \textit{s} after \textit{q}, with \textit{distance(o,s)} \textgreater{} \textit{distance(s,q)}. (d) All segments are aligned to \textit{o}. From the ranked distances, taxa with distances smaller than \textit{distance(o,q)} are also added to \textit{M}. Thus, \textit{M} includes all the nearest evolutionary neighbors for the query segment \textit{q} (the taxa corresponding to segments \textit{a}, \textit{b}, \textit{c}, \textit{d}, \textit{o}, \textit{s}). The taxon ID then assigned to \textit{q} is the lowest common ancestor in the reference species tree (reference taxonomy) of these taxa in \textit{M}. (e) Partially resolved segment subtree at node R that is implied by distances obtained in (c) and (d), where the exact position of some segments (\textit{a}, \textit{b}, \textit{c}, \textit{d}; dashed branches) is left unresolved by the RPA.}\label{fig:publication_taxator-tk_rpa}
\end{figure}

\subsection{Evaluation procedures}\label{evaluation-procedures}

Before evaluating any method, we removed the smallest predicted bins
(1\%) as likely errors. We used the macro-precision and macro-recall as
measures of assignment performance (Supplementary Methods,
\enquote{V. Performance Measures}). The macro-precision specifies the
fraction of correct assignments per predicted bin (precision), averaged
over all such bins, while the macro-recall measures the fraction of
correctly recovered sequence data per truly existing bin (recall),
averaged over all such bins. To account for strong differences in bin
size, we also pooled the species, genus and family assignments, and
reported the overall precision for these ranks as the total fraction of
correct assignments. We tested the assignment performance of different
methods using three simulated short read datasets, simulated 16S rRNA
data, three simulated metagenome contig datasets and using assembled cow
rumen metagenome contigs. For every simulated dataset, we performed
seven cross-validation experiments (Supplementary Methods, \enquote{VII.
Cross-validation}). In each experiment, we simulated a specific minimum
taxonomic distance between a query sequence and the reference sequences.
For the first experiment, all reference data, including the species
genome data from which the query had been sampled, were made available
to the method for assigning a single query sequence as an idealized test
case. In the other six scenarios, all reference data belonging to the
species, genus, family, order, class or phylum of the query sequence,
respectively, were made inaccessible for the method in
leave-one-taxon-out cross-validation experiments. We summarized the
sequence assignments from these experiments to characterize a method's
assignment performance across the entire range of taxonomic distances.
For evaluation with the cow rumen metagenome sample, for which no true
taxonomic labels were known, we divided the assembled contigs into
multiple sequence \enquote{chunks} and characterized the consistency of
taxonomic assignments for chunks originating from the same contig
(Supplementary Methods, \enquote{VIII. Consistency Analysis}).

\section{Results}\label{results-3}

\subsection{Evaluation with unassembled
data}\label{evaluation-with-unassembled-data}

We first evaluated the performance of taxator-tk for classification of
the most widely used taxonomic marker in bacterial diversity studies --
the 16S rRNA gene (Supplementary Material, Supplementary Fig. 3). This
served as a proof of concept, as taxator-tk classifies arbitrary
sequence regions including taxonomic marker genes. We did not expect it
to perform better than sophisticated phylogenetic models for this task,
but wanted to confirm a satisfactory performance. The macro-precision
for the taxonomic assignment of 7176 16S rRNA genes (Supplementary Fig.
4) was constantly above 92\% (Supplementary Fig. 3a) in the combined
cross-validation (Methods), using the whole-genome reference sequences
in mRefSeq47 (Supplementary Fig. 5), not just the 16S genes. More
precisely, the average error rate per bin (one minus precision) was
7.4\% at the species level and 4.6\% at the order level.

Next, we simulated 100,000 reads at 100, 500 and 1000 bp by subsampling
randomly from 1729 species in mRefSeq47 and evaluated taxator-tk with
these three datasets using the (combined) cross-validation experiments.
The performance was very similar for the different fragment sizes
(Supplementary Fig. 6--8a). Overall, taxator-tk showed high precision in
simulated read assignment: The macro-precision for all short read
lengths remained above 74\% and was 82--99\% for the genus to kingdom
ranks, about 10\% lower on average than for the 16S data. This was still
good for the assignment of short sequence fragments from arbitrary
genomic regions compared to a marker gene. At genus level, the
macro-recall was 19--23\% (\textasciitilde{}33\% genera recovered) if
genome sequences of the same species as the query sequence were provided
in the reference (Supplementary Fig. 6--8b) and as low as 5--7\%
(\textasciitilde{}16\% genera recovered) otherwise (Supplementary Fig.
6--8c). The macro-recall depends on the availability of related reference
data at the respective ranks. It decreases when removing reference data
for cross-validation. For example, if all reference data at genus level
are removed, then no correct assignments to the genus rank are possible.
For lower taxonomic ranks, the macro-recall was also low due to the
large number of sample taxa and their uneven representation caused by
the taxonomic bias towards a few abundant phyla in mRefSeq47. The longer
reads had a slightly higher macro-recall than the shorter ones. Since
longer sequences yield better recall and because overlapping reads
contain redundant information, leading to more alignment computations,
we recommend applying taxator-tk to (partially) assembled data. For
longer query sequences, we were more likely to find segments for
processing and therefore to assign a larger portion of the sample.

\subsection{Evaluation with simulated metagenome
contigs}\label{evaluation-with-simulated-metagenome-contigs}

For our tests on three simulated contig samples, we compared taxator-tk
to CARMA3 and MEGAN4/5 using the same taxonomy and the same nucleotide
alignments against mRefSeq54 (Supplementary Fig. 9). Additionally, we
applied these three methods to two datasets using protein-level
alignments which we inferred using BLAST+/tblastx. When doing so, we
used the programs' recommended parameter settings (Supplementary Methods,
\enquote{X. Program Parameters and Versions}) and cross-validation, as
before (Supplementary Methods, \enquote{VII. Cross-validation}).

We created a simulated NGS metagenome dataset (simArt49e, composition in
Supplementary Fig. 10) for our evaluation. This sample includes 49
equally abundant species (51 strains) and was created by Illumina paired
read simulation with pIRS (Hu et al.,
\protect\hyperlink{ref-HuPirs2012}{2012}), followed by SOAPdenovo
version 1.05 (Luo et al.,
\protect\hyperlink{ref-LuoSoapdenovo22012}{2012}) assembly. Around 160
Mbp or 267,178 contigs remained after removal of 0.03\% chimeric
sequences. In the combined cross-validation with this dataset
(Supplementary Fig. 11--13a), taxator-tk produced substantially fewer
errors: Sequence assignments to species, genus and family were 91\%
correct for taxator-tk, compared to 52\% for CARMA3 and 59\% for MEGAN4.
Accordingly, taxator-tk showed the highest macro-precision of all
methods, e.g.~61\% at the species level, compared to 3\% (CARMA3) and
5\% (MEGAN4). The low macro-precision observed for CARMA3 and MEGAN4 is
largely due to the prediction of many small bins with many false
assignments (Supplementary Methods, \enquote{V. Performance Measures}).
The majority of assignments were to Bacteria, Archaea, or undetermined
in the case of CARMA3, because we restricted the availability of similar
reference sequences in each of the individual cross-validations, which
we then jointly assessed.

When only the sequences from the corresponding species and genus were
removed from the reference (new genus scenario, Supplementary Fig.
11--13d), taxator-tk was also the most precise, though it had a lower
recall than the other methods (taxator-tk: 56\% family macro-precision,
60\% overall precision for species to family, 10\% family macro-recall;
CARMA3: 13\%, 27\% and 20\%; MEGAN4: 22\%, 27\% and 31\%). Differences
in assignment precision were also evident in the number of predicted
taxon bins: For instance, when simulating novel families (Supplementary
Fig. 11--13e), many more species bins were predicted by CARMA3 (1672)
and MEGAN4 (824) than by taxator-tk (65), with 49 species being present
in the sample. Similarly, MEGAN4 predicted 69 orders, CARMA3 81 and
taxator-tk 27, compared to the existing 32 orders in simArt49e
(Figure~\ref{fig:publication_taxator-tk_simulated}). Overall, taxonomic
assignments of taxator-tk were more rarely to false taxa at low ranks
than with the other methods, and instead were to higher-ranking correct
taxa. The other two methods assigned a substantial amount of sequence
data incorrectly to bins at the family level or below. This can be
seriously misleading if the results were to be used to estimate species
diversity or to reconstruct genomes. Therefore, taxator-tk is better
suited for taxonomic profiling in addition to its primary task -- the
recovery of individual taxonomic sequence bins from shotgun datasets.

To investigate the reason for the observed differences between overall
and macro-precision, which reflect variations in assignment precision
for bins of different sizes, we plotted the per-bin precision at the
family level in the combined cross-validation, as a function of
predicted bin size with a k-nearest-neighbor (kNN) estimate of
macro-precision (Figure~\ref{fig:publication_taxator-tk_precision}; see
Supplementary Fig. 14 for all ranks). Overall, the bins predicted by
taxator-tk were smaller, more precise and much more likely to represent
truly existing taxa than those predicted by the other programs although
larger bins tended to be more accurate for all methods. CARMA3 and
MEGAN4 predicted a substantial number of mostly smaller-sized incorrect
bins. Although the size-dependent kNN precision curves at large bin
sizes are unaffected by these small bins, the curves remained below 70\%
(CARMA3) or 80\% (MEGAN4), whereas the taxator-tk curve reached almost
100\%. For the smallest bins, taxator-tk's kNN precision was
\textasciitilde{}20\% whereas bins below 500 kbp for CARMA3 and MEGAN4
were practically indistinguishable from noise. This shows that the high
macro-precision with taxator-tk is not only due to a lower frequency of
falsely predicted bins, but also due to a substantially higher precision
for the large bins.

Next, we performed cross-validation on the FAMeS (Mavromatis et al.,
\protect\hyperlink{ref-MavromatisUse2007}{2007}) SimMC/AMD
(\textasciitilde{}17 Mbp/7307 contigs) and SimHC/soil
(\textasciitilde{}17 Mbp/7307 contigs) simulated metagenome datasets.
These contigs were assembled from simulated Sanger (not NGS) reads and
represent considerably smaller samples than those which are generated
with the current NGS technologies (Dröge \& McHardy,
\protect\hyperlink{ref-DrogeTaxonomic2012}{2012}). We also measured the
methods' performance on these data for a direct comparison to previous
works. As before, taxator-tk had the highest macro-precision and the
most realistic number of predicted taxon bins (Supplementary Fig. 15,
16; Supplementary Methods \enquote{XII. FAMeS Cross-validation}).

For the contig assignments of the composition-based program PhyloPythiaS
(Patil et al., \protect\hyperlink{ref-PatilTaxonomic2011}{2011}), we
could not apply cross-validation, due to the computational effort of
training many models. Therefore we adopted the published evaluation
scenario from Patil et al.
(\protect\hyperlink{ref-PatilTaxonomic2011}{2011}), in which all genome
sequences of the SimMC genera were removed from the reference genome
sequence before classifying the contigs. All programs were provided with
the remaining sequenced genomes and an additional 100 kbp of reference
data for each of the three dominant strains. The latter could be used by
PhyloPythiaS to infer a corresponding species model, but were less
helpful for the similarity-based classifiers. We generated assignments
with taxator-tk, CARMA3 and MEGAN4/5 under equivalent conditions, once
with nucleotide and once separately with protein local alignments, and
compared them to both Kraken and the published PhyloPythiaS assignments
(Supplementary Fig. 21). The performance and error distributions for the
similarity-based programs (Supplementary Fig. 21c--d) were consistent
with our previous evaluations with SimMC. MEGAN4 and MEGAN5 produced
almost identical results. Using protein local alignments, we observed a
moderate increase in overall species to family precision for MEGAN5 and
CARMA3, while taxator-tk improved in macro-recall. Notably, taxator-tk
showed the best macro-precision of all similarity-based programs and all
ranks, regardless of which alignment kind was used. Kraken produced the
most errors and the lowest macro-precision, because it assigned almost
exclusively at species level. This would make it generally unsuitable in
situations where sequences of closely related genomes are unavailable.
However, it had a comparatively high macro-recall up to the order level.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_taxatortk/figure_3.pdf}
\caption[Comparison of three classifiers for a simulated metagenome sample with 49 species]{Comparison of three classifiers for a novel-family simulation using a simulated metagenome sample (simArt49e) with 49 species. CARMA3, MEGAN4 and taxator-tk: The outer ring with red background shading shows family-level assignments for all orders included in the simulated data set. These are all false in the chosen evaluation scenario, as no data from the families of the query sequences were included in the reference collection in the leave-one-taxon-out cross-validation experiments. Clearly, taxator-tk had the fewest assignments at family level, demonstrating its high precision in assignments. Assignments at inner rings, grey background shading, can be correct in principle, demonstrating at which taxonomic ranks the different methods tend to make their assignments, with taxator-tk tending towards producing higher-ranking assignments, as a trade-off for the high precision.}\label{fig:publication_taxator-tk_simulated}
\end{figure}

Assignment with PhyloPythiaS showed that composition-based
classification, when supplied with limited amounts of additional
training data from the relevant species, correctly assigned most data at
the genus and family levels (species assignments were not assessed in
the original publication), which were either rarely assigned by
taxator-tk or mostly incorrectly assigned by CARMA3, MEGAN and Kraken.
However, PhyloPythiaS predicted only 6 families compared to 29
underlying families, versus 43 (Kraken), 14/18 (taxator-tk), 50/32
(CARMA3) and 17/18 (MEGAN5) with nucleotide or protein alignments,
respectively. PhyloPythiaS had the highest macro-recall. The
macro-precision (\textasciitilde{}50\% for genus, family and order
level) was also higher than for Kraken (\textasciitilde{}4--13\%), MEGAN
(\textasciitilde{}7--31\%) or CARMA3 (\textasciitilde{}7--48\%) but less
than for taxator-tk (\textasciitilde{}32--68\%). However, unlike for the
other programs, the modeled taxa for PhyloPythiaS should be specified a
priori to achieve optimal performance. It is therefore best applied when
the taxonomic composition of a microbial community has already been
determined and sufficient training data are available for the identified
taxa.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_taxatortk/figure_4.pdf}
\caption[Family-level bin precision for the simulated metagenome sample with 49 species]{Family-level bin precision for the simulated metagenome sample with 49 species (simArt49e). (a-c) Each family bin’s assignment precision related to logarithmic bin size for seven cross-validation experiments with simArt49e. The results of the single experiments were added to assess the taxonomic assignment performance across a range of evolutionary distances between the query and the reference sequences, excluding the least abundant bins (1\% of total bp). We calculated the precision values for (a) CARMA3, (b) MEGAN4 and (c) taxator-tk, counting assignments to lower-ranking taxa at the family level, and added a smoothed k-nearest-neighbor estimate of the mean precision in R using wapply (width=0.3) followed by smooth.spline (df=10). CARMA3 and MEGAN4 incorrectly identified many small taxonomic bins, substantially more than taxator-tk. (d) gives the amount of correct, false and undetermined family-level assignments for the different classifiers with simArt49e.}\label{fig:publication_taxator-tk_precision}
\end{figure}

\subsection{Evaluation with real metagenome
contigs}\label{evaluation-with-real-metagenome-contigs}

For microbial communities in many environments, only distantly related
reference genome sequences are available. We analyzed a medium complex
metagenome sample of such a microbial community from cow rumen (Hess et
al., \protect\hyperlink{ref-HessMetagenomic2011}{2011}) with taxator-tk,
CARMA3, MEGAN4/5 and PhyloPythiaS (the general model with the 100 most
abundant species among sequenced prokaryotes). We considered scaffolds
to be less reliable than contigs, which we reconstructed by splitting
the available scaffolds at gaps of more than 200 positions (A. Sczyrba,
personal communication). We subsequently divided contigs longer than 10
kbp into sequence \enquote{chunks} of 2 kbp, resulting in a 319 Mbp
dataset, which we used to assess the assignment consistency for chunks
originating from the same contig. The chunk sequences were assigned with
taxator-tk, CARMA3, MEGAN (given identical nucleotide/protein
alignments), Kraken and PhyloPythiaS. As the standard of truth for each
contig, we determined the taxon minimizing the inconsistency between all
corresponding chunk assignments (Gregor et al.,
\protect\hyperlink{ref-GregorPhylopythias2014}{2014}, unpublished) for
each method independently. A chunk assignment was considered consistent
if it was to the same taxon as the one for the entire contig, and
inconsistent otherwise. The consistency of a taxonomic bin is the
fraction of chunk sequences with matching contig assignments and the
macro-consistency is the consistency averaged over all predicted taxa,
similar to the macro-precision.

In agreement with the results for the simulated metagenome datasets, the
taxator-tk results were the most consistent among all tested methods,
regardless of the alignment type (Supplementary Fig. 22): 76--89\%
macro-consistency at species to order level, in comparison to MEGAN
(34--40\%), CARMA (0--55\%), Kraken (32--35\%) and PhyloPythiaS
(56--65\%). The overall consistency (analogous to overall precision) for
species to family levels was 97/97\% with taxator-tk, 39/48\% with
CARMA3, 62/64\% with MEGAN (nucleotide/protein-level), 42\% with Kraken
and 82\% with PhyloPythiaS. Likewise, taxator-tk assigned less data at
species to family level, with a total of 13/12 Mbp being consistent
compared to CARMA3 (8/26 Mbp), MEGAN (42/47 Mbp), Kraken (19 Mbp) or
PhyloPythiaS (14 Mbp). The different methods again determined different
numbers of taxa: CARMA3 predicted 572/611 genera with a
macro-consistency of 53/31\%, MEGAN 264/203 genera (34/37\%), Kraken 661
(32\%), PhyloPythiaS 33 (63\%) and taxator-tk found 110/27 genera
(76/81\%). The high consistency values observed for taxator-tk indicate
that it is a precise taxonomic classifier for real metagenomic contigs.

\subsection{Run-time analyses}\label{run-time-analyses}

The run-time for the taxonomic metagenome assignment was measured as the
time to find homologs and to assign taxon IDs to all sequences. We
evaluated the run-times of all methods using the same set of alignments
generated with either BLAST or LAST. Thus the run-time for the initial
similarity search was identical for all methods. We determined the time
for the taxonomic assignment of simArt49e for all methods when
performing a cross-validation with families present in the test dataset
removed from the reference data
(Figure~\ref{fig:publication_taxator-tk_simulated}). This took two
minutes with Kraken (single CPU core and \textasciitilde{}100 GiB RAM),
one hour for MEGAN4 (interactive mode), 6 hours for taxator-tk
(\textasciitilde{}10 CPU cores) and almost a week for CARMA3
(\textasciitilde{}20 CPU cores). The parallelization of taxator-tk led
to a linear decrease in time with the number of CPU cores for up to 15
cores, which became sublinear for 20 cores or more (Supplementary Fig.
23). To provide a more specific estimate of the throughput of
taxator-tk, we aligned \textasciitilde{}1 Gbp of cow rumen sequence data
with BLAST against mRefSeq54 and assigned the data with taxator-tk on 10
CPU cores (AMD Opteron 6386 SE). We measured an average throughput of
5.9 Gbp per day for the combined alignment and taxonomic assignment
steps with this dataset. We also determined how our implementation
scaled for increasing input sequence lengths and reference exclusion
scenarios (Supplementary Fig. 24a). The run-time scaled approximately
linearly except when the same or very similar species were among the
reference genome sequences. In general, the greater the number of
similar sequences in the reference data, the longer taxator-tk's
run-time was for the alignment of longer sequence stretches with more
homologs. Simultaneously, we investigated the impact of the query
segmentation on taxator-tk's run-time (Supplementary Fig. 24b) and found
that it reduced the total run-time by up to 30\%.

\section{Discussion}\label{discussion-2}

Taxator-tk is a taxonomic assignment software package which generates
very precise taxonomic assignments with few errors for metagenome
shotgun sequences. To provide a fair comparison, we invested extensive
effort into ensuring that we evaluated all methods under identical
conditions with the same reference sequences, test datasets and
background taxonomies, using their recommended settings. We evaluated
taxator-tk on 16S gene sequences, on simulated short reads, with
simulated assembled contigs and with 2 kbp contig fragments from a real
cow rumen metagenome. For each simulated sample, we evaluated a wide
range of evolutionary distances between the query and reference
sequences using leave-one-taxon-out cross-validation. Taxator-tk was the
most precise of all tested methods with the most realistic number of
identified taxa overall. This property was very pronounced for lower
taxonomic ranks from species to family level. However, taxator-tk
assigned fewer data overall than other methods from species to family.
For the small assembled SimMC dataset, it assigned fewer data,
particularly in comparison to the composition-based classifier
PhyloPythiaS, when 100 kbp of data were provided for individual
community members to train species-level models. For the real cow rumen
dataset, taxator-tk was the most consistent in terms of classifying
multiple pieces of one contig. Our results consistently indicate that
taxator-tk's strength is its high precision of assignments, which allows
us to confidently assign a core of sample sequences and thereby to infer
the taxonomic composition of the community. In comparison to assignments
based on marker genes, it has the advantages that it makes assignments
across all domains of life and that corresponding abundance estimates
from shotgun sequences are less affected by copy number variations of
individual genes. Such shotgun estimates are also unaffected by PCR
primer amplification biases, unlike marker gene sequencing techniques,
and do not require high-quality reference gene phylogenies for marker
genes. We confirmed this by in-depth analysis of six 15 Gbp shotgun
samples from the barley rhizosphere, where we applied taxator-tk to
characterize the taxonomic composition of Bacteria, Archaea and
Eukaryotes, which correlated with results from 16S rRNA profiling and
showed the most notable deviations for taxa known to be affected by
primer biases or having multiple copies of the 16S rRNA gene (Bulgarelli
et al., unpublished). To target draft genome reconstructions, the data
assigned to individual taxonomic bins by taxator-tk can be used as
training data for complementary approaches, such as composition-based
methods, or as independent information in combination with recently
proposed clustering methods using the abundance of genes or contigs
across multiple samples.

From a methodological point of view, we have introduced a method for the
fast approximation of the evolutionary neighborhood of a query sequence
with a run-time that increases linearly with the number of homologs. In
de-novo phylogenetic inference methods, the run-time increases at least
log-linearly with the number of homologs or they rely on time-consuming
optimizations of parameter-rich phylogenetic models, which generates
excessive computational requirements for the analysis of Gbp-sized NGS
samples. Our software provides an easy to use and scalable alternative
to taxonomic classification of marker genes that is applicable to any
nucleotide fragment. Unlike other similarity-based taxonomic classifiers
for shotgun data, our algorithm handles different degrees of sequence
conservation without preset or user-specified parameters such as
alignment scores (overall or per gene family) and without being
restricted to the analysis of a number of high-quality homologs with a
minimal length. At the same time, the inferred evolutionary neighborhood
is extended by the identification of an outgroup, leading to more
precise taxonomic assignments, while regions without detectable
taxonomic signal are instantly discarded. We post-process independent
taxonomic assignments of query segments to infer an assignment for the
entire query and do this using a majority vote algorithm with a few
robust default parameters. This computationally lightweight step can be
quickly repeated with other values for the majority and minimum support
parameters, if required. In addition to the algorithmic considerations
and other run-time optimizations, we implemented query sequence
segmentation and program parallelization, which allow large-scale data
analysis with a throughput of several Gbp per day on a standard
multiprocessor system.

The program's scope is also not limited to the taxonomic assignment of
metagenomes: It can be applied to any DNA or RNA sequence. For instance,
another successful in-house application is the detection of
contaminations in isolate sequencing data. Furthermore, the program
taxator within taxator-tk provides taxonomic information for individual
query segments (Supplementary Fig. 2, 25), which could be used to
identify assembly errors or regions acquired by lateral gene transfer.

\section{Acknowledgments}\label{acknowledgments}

Computational support and infrastructure were provided by the
\enquote{Centre for Information and Media Technology} (ZIM) at the
University of Düsseldorf (Germany).

\section{Funding}\label{funding}

The authors gratefully acknowledge funding by the Max-Planck society and
Heinrich Heine University Düsseldorf.

\chapter{A Probabilistic Model to Recover Genomes in Shotgun
Metagenomics}\label{sec:full_mglex}

\textbf{Johannes Dröge\textsuperscript{1}, Alexander
Schönhuth\textsuperscript{2}, Alice C. McHardy\textsuperscript{1*}}

\textsuperscript{1}Helmholtz Centre for Infection Research,
Braunschweig, Germany

\textsuperscript{2}Centrum Wiskunde \& Informatica, Amsterdam, The
Netherlands

\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}

\textbf{This is an author-produced version of an article under revision
in \emph{\href{https://peerj.com/computer-science/}{PeerJ Computer
Science}}. This article version has been adapted to the thesis layout.
The original open-access article is accessible by DOI
\href{https://doi.org/10.7287/peerj.preprints.2626}{10.7287/peerj.preprints.2626}.}

\section{Abstract}\label{abstract-1}

Shotgun metagenomics of microbial communities reveals information about
strains of relevance for applications in medicine, biotechnology and
ecology. Recovering their genomes is a crucial, but very challenging
step, due to the complexity of the underlying biological system and
technical factors. Microbial communities are heterogeneous, with
oftentimes hundreds of present genomes deriving from different species
or strains, all at varying abundances and with different degrees of
similarity to each other and reference data. We present a versatile
probabilistic model for genome recovery and analysis, which aggregates
three types of information that are commonly used for genome recovery
from metagenomes. As potential applications we showcase metagenome
contig classification, genome sample enrichment and genome bin
comparisons. The open source implementation MGLEX is available via the
\href{https://pypi.python.org/pypi/mglex/}{Python Package Index} and on
\href{https://github.com/hzi-bifo/mglex/}{GitHub} and can be embedded
into metagenome analysis workflows and programs.

\section{Introduction}\label{introduction-3}

Shotgun sequencing of DNA extracted from a microbial community recovers
genomic data from different community members while bypassing the need
to obtain pure isolate cultures. It thus enables novel insights into
ecosystems, especially for those genomes which are inaccessible by
cultivation techniques and isolate sequencing. However, current
metagenome assemblies are oftentimes highly fragmented, including
unassembled reads, and require further processing to separate data
according to the underlying genomes. Assembled sequences, called
contigs, that originate from the same genome are placed together in this
process, which is known as metagenome binning (Tyson et al.,
\protect\hyperlink{ref-TysonCommunity2004}{2004}; Dröge \& McHardy,
\protect\hyperlink{ref-DrogeTaxonomic2012}{2012}) and for which many
programs have been developed. Some are trained on reference sequences,
using contig \(k\)-mer frequencies or sequence similarities as sources
of information (McHardy et al.,
\protect\hyperlink{ref-MchardyAccurate2007}{2007}; Dröge, Gregor \&
McHardy, \protect\hyperlink{ref-DrogeTaxatortk2014}{2014}; Wood \&
Salzberg, \protect\hyperlink{ref-WoodKraken2014}{2014}; Gregor et al.,
\protect\hyperlink{ref-GregorPhylopythias2016}{2016}), which can be
adapted to specific ecosystems. Others cluster the contigs into genome
bins, using contig \(k\)-mer frequencies and read coverage (Chatterji et
al., \protect\hyperlink{ref-ChatterjiCompostbin2008}{2008}; Kislyuk et
al., \protect\hyperlink{ref-KislyukUnsupervised2009}{2009}; Wu et al.,
\protect\hyperlink{ref-WuMaxbin2014}{2014}; Nielsen et al.,
\protect\hyperlink{ref-NielsenIdentification2014}{2014}; Imelfort et
al., \protect\hyperlink{ref-ImelfortGroopm2014}{2014}; Alneberg et al.,
\protect\hyperlink{ref-AlnebergBinning2014}{2014}; Kang et al.,
\protect\hyperlink{ref-KangMetabat2015}{2015}; Lu et al.,
\protect\hyperlink{ref-LuCocacola2016}{2016}).

Recently, oftentimes multiple biological or technical samples of the
same environment are sequenced to produce distinct genome copy numbers
across samples, sometimes using different sequencing protocols and
technologies, such as Illumina and PacBio sequencing (Hagen et al.,
\protect\hyperlink{ref-HagenQuantitative2016}{2016}). Genome copies are
reflected by corresponding read coverage variation in the assemblies
which allows samples with many genomes to be resolved. The combination of
experimental techniques helps to overcome platform-specific shortcomings
such as short reads or high error rates in the data analysis. However,
reconstructing high-quality bins of individual strains remains difficult
without very high numbers of replicates. Often, genome reconstruction
may improve by manual intervention and iterative analysis
(Figure~\ref{fig:binning_workflow}) or additional sequencing
experiments.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_mglex/main_mglex-scheme.pdf}
\caption[Genome reconstruction workflow]{Genome reconstruction workflow. To recover genomes from environmental sequencing data, the illustrated processes can be iterated. Different programs can be run for each process and iteration. MGLEX can be applied in all steps: (a) to classify contigs or to cluster by embedding the probabilistic model into an iterative procedure; (b) to enrich a metagenome for a target genome to reduce its size and to filter out irrelevant sequence data; (c) to select contigs of existing bins based on likelihoods and p-values and to repeat the binning process with a reduced dataset; (d) to refine existing bins, for instance to merge bins as suggested by bin analysis.}\label{fig:binning_workflow}
\end{figure}

Genome bins can be constructed by consideration of genome-wide sequence
properties. Currently, oftentimes the following types of information are
considered:

\begin{itemize}
\tightlist
\item
  Read contig coverage: sequencing read coverage of assembled contigs,
  which reflects the genome copy number (organismal abundance) in the
  community. Abundances can vary across biological or technical
  replicates, and co-vary for contigs from the same genome, supplying
  more information to resolve individual genomes (Baran \& Halperin,
  \protect\hyperlink{ref-BaranJoint2012}{2012}; Albertsen et al.,
  \protect\hyperlink{ref-AlbertsenGenome2013}{2013}).
\item
  Nucleotide sequence composition: the frequencies of short nucleotide
  subsequences of length \(k\) called \(k\)-mers. The genomes of
  different species have a characteristic \(k\)-mer spectrum (Karlin,
  Mrazek \& Campbell,
  \protect\hyperlink{ref-KarlinCompositional1997}{1997}; McHardy et al.,
  \protect\hyperlink{ref-MchardyAccurate2007}{2007}).
\item
  Sequence similarity to reference sequences: a proxy for the
  phylogenetic relationship to species which have already been
  sequenced. The similarity is usually inferred by alignment to a
  reference collection and can be expressed using taxonomy (McHardy et
  al., \protect\hyperlink{ref-MchardyAccurate2007}{2007}).
\end{itemize}

Probabilities represent a convenient and efficient way to represent and
combine information that is uncertain by nature. Here, we

\begin{itemize}
\tightlist
\item
  propose a probabilistic aggregate model for binning based on three
  commonly used information sources, which can easily be extended to
  include new features.
\item
  outline the features and submodels for each information type. As the
  feature types listed above derive from distinct processes, we define
  for each of them independently a suitable probabilistic submodel.
\item
  showcase several applications related to the binning problem.
\end{itemize}

A model with data-specific structure poses an advantage for genome
recovery in metagenomes because it uses data more efficiently for
fragmented assemblies with short contigs or a low number of samples for
differential coverage binning. Being probabilistic, it generates
probabilities instead of hard labels so that a contig can be assigned to
several, related genome bins and the uncertainty can easily be assessed.
The models can be applied in different ways, not just classification,
which we show in our application examples. Most importantly, there is a
rich repertoire of higher-level procedures based on probabilistic
models, including Expectation Maximization (EM) and Markov Chain Monte
Carlo (MCMC) methods for clustering with little or no prior knowledge
of the modeled genomes.

We focus on defining explicit probabilistic models for each feature type
and their combination into an aggregate model. In contrast, binning
methods often concatenate and transform features (Chatterji et al.,
\protect\hyperlink{ref-ChatterjiCompostbin2008}{2008}; Imelfort et al.,
\protect\hyperlink{ref-ImelfortGroopm2014}{2014}; Alneberg et al.,
\protect\hyperlink{ref-AlnebergBinning2014}{2014}) before clustering.
Specific models for the individual data types can be better tailored to
the data generation process and will therefore generally enable a better
use of information and a more robust fit of the aggregate model while
requiring fewer data. We propose a flexible model with regard to both
the included features and the feature extraction methods. There already
exist parametric likelihood models in the context of clustering, for a
limited set of features. For instance, Kislyuk et al.
(\protect\hyperlink{ref-KislyukUnsupervised2009}{2009}) use a model for
nucleotide composition and Wu et al.
(\protect\hyperlink{ref-WuMaxbin2014}{2014}) integrated distance-based
probabilities for 4-mers and absolute contig coverage using a Poisson
model. We extend and generalize this work so that the model can be used
in different contexts such as classification, clustering, genome
enrichment and binning analysis. Importantly, we are not providing an
automatic solution to binning but present a flexible framework to target
problems associated with binning. This functionality can be used in
custom workflows or programs for the steps illustrated in
Figure~\ref{fig:binning_workflow}. As input, the model incorporates
genome abundance, nucleotide composition and additionally sequence
similarity (via taxonomic annotation). The latter is common as taxonomic
binning output (Dröge, Gregor \& McHardy,
\protect\hyperlink{ref-DrogeTaxatortk2014}{2014}; Wood \& Salzberg,
\protect\hyperlink{ref-WoodKraken2014}{2014}; Gregor et al.,
\protect\hyperlink{ref-GregorPhylopythias2016}{2016}) and for quality
assessment but has rarely been systematically used as features in
binning (Chatterji et al.,
\protect\hyperlink{ref-ChatterjiCompostbin2008}{2008}; Lu et al.,
\protect\hyperlink{ref-LuCocacola2016}{2016}). We show that taxonomic
annotation is valuable information that can improve binning
considerably.

\section{Methods}\label{methods-3}

\subsection{Classification models}\label{classification-models}

Classification is a common concept in machine learning. Usually, such
algorithms use training data for different classes to construct a model
which then contains the condensed information about the important
properties that distinguish the data of the classes. In probabilistic
modeling, we describe these properties as parameters of likelihood
functions, often written as \(\theta\). After \(\theta\) has been
determined by training, the model can be applied to assign novel data to
the modeled classes. In our application, classes are genomes, or bins,
and the data are nucleotide sequences like contigs. Thus, contigs can be
assigned to genome bins but we need to provide training sequences for
the genomes. Such data can be selected by different means, depending on
the experimental and algorithmic context. One can screen metagenomes for
genes which are unique to clades, or which can be annotated by
phylogenetic approaches, and use the corresponding sequence data for
training (Gregor et al.,
\protect\hyperlink{ref-GregorPhylopythias2016}{2016}). Independent
assemblies or reference genomes can also serve as training data for
genome bins (Brady \& Salzberg,
\protect\hyperlink{ref-BradyPhymm2009}{2009}; Patil et al.,
\protect\hyperlink{ref-PatilTaxonomic2011}{2011}; Gregor et al.,
\protect\hyperlink{ref-GregorPhylopythias2016}{2016}). Another direct
application is to learn from existing genome bins, which were derived by
any means, and then to (re)assign contigs to these bins. This is useful
for short contigs which are often excluded from binning and analysis due
to their high variability. Finally, probabilistic models can be embedded
into iterative clustering algorithms with random initialization.

\subsection{Aggregate model}\label{aggregate-model}

Let \(1\le i\le D\) be an index referring to \(D\) contigs resulting
from a shotgun metagenomic experiment. In the following we will present
a generative probabilistic aggregate model that consists of components,
indexed by \(1\le k\le M\), which are generative probabilistic models in
their own right, yielding probabilities
\(P_k(\text{contig}_i\mid\text{genome})\) that \(\text{contig}_i\)
belongs to a particular genome. Each of the components \(k\) reflects a
particular feature such as

\begin{itemize}
\tightlist
\item
  a weight \(w_i\) (contig length)
\item
  sample abundance feature vectors \(\bm{a_i}\) and \(\bm{r_i}\), one
  entry per sample
\item
  a compositional feature vector \(\bm{c_i}\), one entry per
  compositional feature (e.g.~a \(k\)-mer)
\item
  a taxonomic feature vector \(\bm{t_i}\), one entry per taxon
\end{itemize}

We define the individual feature vectors in the corresponding sections.
As mentioned before, each of the \(M\) features gives rise to a
probability \(P_k(\text{contig}_i \mid \text{genome})\) that
\(\text{contig}_i\) belongs to a specific genome by means of its
component model. Those probabilities are then collected into an
aggregate model that transforms those feature specific probabilities
\(P_k(i \mid \text{genome})\) into an overall probability
\(P(i \mid \text{genome})\) that contig \(i\) is associated with the
genome. In the following, we describe how we construct this model with
respect to the individual submodels \(P_k(i \mid \text{genome})\), the
feature representation of the contigs and how we determine the optimal
set of parameters from training sequences.

For the \(i\)\textsuperscript{th} contig, we define a joint likelihood
for genome bin \(g\) (Equation~\ref{eq:likelihood_aggregate}, the
probabilities written as a function of the genome parameters), which is
a weighted product over \(M\) independent component likelihood
functions, or submodels, for the different feature types. For the
\(k\)\textsuperscript{th} submodel, \(\bm{\mathit{\Theta_k}}\) is the
corresponding parameter vector, \(\bm{F_{i,k}}\) the feature vector of
the \(i\)\textsuperscript{th} contig and \(\alpha_k\) defines the
contribution of the respective submodel or feature type. \(\beta\) is a
free scaling parameter to adjust the smoothness of the aggregate
likelihood distribution over the genome bins (bin posterior).

\begin{equation}
\mathcal{L}(\mathbf{\Theta_g} \mid \mathbf{F_i})
= \left( \prod_{k=1}^M \mathcal{L}(\bm{\mathit{\Theta_{gk}}} \mid \bm{F_{ik}})^{\alpha_k} \right)^\beta
\label{eq:likelihood_aggregate}\end{equation}

We assume statistical independence of the feature subtypes and multiply
likelihood values from the corresponding submodels. This is a simplified
but reasonable assumption: e.g., the species abundance in a community
can be altered by external factors without impacting the nucleotide
composition of the genome or its taxonomic position. Also, there is no
direct relation between a genome's \(k\)-mer distribution and taxonomic
annotation via reference sequences.

All model parameters, \(\mathbf{\Theta_g}\), \(\bm \alpha\) and
\(\beta\), are learned from training sequences. We will explain later,
how the weight parameters \(\bm \alpha\) and \(\beta\) are chosen and
begin with a description of the four component likelihood functions, one
for each feature type.

In the following, we denote the \(j\)\textsuperscript{th} position in a
vector \(\bm{x_i}\) with \(x_{i,j}\). To simplify notation, we also
define the sum or fraction of two vectors of the same dimension as the
positional sum or fraction and write the length of vector \(\bm{x}\) as
\(len(\bm{x})\).

\subsection{Absolute abundance}\label{absolute-abundance}

We derive the average number of reads covering each contig position from
assembler output or by mapping the reads back onto contigs. This mean
coverage is a proxy for the genome abundance in the sample because it is
roughly proportional to the genome copy number. A careful library
preparation causes the copy numbers of genomes to vary differently over
samples, so that each genome has a distinct relative read distribution.
Depending on the amount of reads in each sample being associated with
every genome, we obtain for every contig a coverage vector \(\bm{a_i}\)
where \(len(\bm{a_i})\) is the number of samples. Therefore, if more
sample replicates are provided, contigs from different genomes are
generally better separable since every additional replicate adds an
entry to the feature vectors.

Random sequencing followed by perfect read assembly theoretically
produces positional read counts which are Poisson distributed, as
described in Lander \& Waterman
(\protect\hyperlink{ref-LanderGenomic1988}{1988}). In
Equation~\ref{eq:likelihood_poisson}, we derived a similar likelihood
using mean coverage values (see Supplementary Methods for details). The
likelihood function is a normalized product over the independent Poisson
functions \(P_{\theta_j}(a_{i,j})\) for each sample. The expectation
parameter \(\theta_j\) represents the genome copy number in the
\(j\)\textsuperscript{th} sample.

\begin{equation}
\mathcal{L}(\bm{\theta} \mid \bm{a_i})
= \sqrt[len(\bm{a_i})]{\prod_{j=1}^{len(\bm{a_i})} P_{\theta_j}(a_{i,j})}
= \sqrt[len(\bm{a_i})]{\prod_{j=1}^{len(\bm{a_i})} \frac{\theta_j^{a_{i,j}}}{a_{i,j} !} e^{-\theta_j}}
\label{eq:likelihood_poisson}\end{equation}

The Poisson explicitly accounts for low and zero counts, unlike a
Gaussian model. Low counts are often observed for undersequenced and
rare taxa. Note that \(a_{i,j}\) is independent of \(\bm{\theta}\). We
derived the model likelihood function from the joint Poisson over all
contig positions by approximating the first data-term with mean coverage
values (Supplementary Methods).

The maximum likelihood estimate (MLE) for \(\bm{\theta}\) on training
data is the weighted average of mean coverage values for each sample in
the training data (Supplementary Methods).

\begin{equation}
\bm{\hat \theta}
= \dfrac{ \sum\limits_{i=1}^{N} w_i \, \bm{a_i} }{ \sum\limits_{i=1}^{N} w_i }
\label{eq:mle_poisson}\end{equation}

\subsection{Relative abundance}\label{relative-abundance}

In particular for shorter contigs, the absolute read coverage is often
overestimated. Basically, the Lander-Waterman assumptions (Lander \&
Waterman, \protect\hyperlink{ref-LanderGenomic1988}{1988}) are violated
if reads do not map to their original locations due to sequencing errors
or if they \enquote{stack} on certain genome regions because they are
ambiguous (i.e.~for repeats or conserved genes), rendering the Poisson
model less appropriate. The Poisson, when constrained on the total sum
of coverages in all samples, leads to a binomial distribution as shown
by Przyborowski \& Wilenski
(\protect\hyperlink{ref-PrzyborowskiHomogeneity1940}{1940}). Therefore,
we model differential abundance over different samples using a binomial
in which the parameters represent a relative distribution of genome
reads over the samples. For instance, if a particular genome had the
same copy number in a total of two samples, the genome's parameter
vector \(\bm{\theta}\) would simply be \([0.5,0.5]\). As for absolute
abundance, the model becomes more powerful with a higher number of
samples. Using relative frequencies as model parameters instead of
absolute coverages, however, has the advantage that any constant
coverage factor cancels in the division term. For example, if a genome
has two similar gene copies which are collapsed during assembly, twice
as many reads will map onto the assembled gene in every sample but the
relative read frequencies over samples will stay unaffected. This makes
the binomial less sensitive to read mapping artifacts but requires two
or more samples because one degree of freedom (DF) is lost by the
division.

The contig features \(\bm{r_i}\) are the mean coverages in each sample,
which are identical to \(\bm{a_i}\) in the absolute abundance model, and
the model's parameter vector \(\bm{\theta}\) holds the relative read
frequencies in the samples, as explained before. In
Equation~\ref{eq:likelihood_binomial} we ask: how likely is the observed
mean contig coverage \(r_{i,j}\) in sample \(j\) given the genome's
relative read frequency \(\theta_j\) of the sample and the contig's
total coverage \(R_{i}\) for all samples. The corresponding likelihood
is calculated as a normalized product over the binomials
\(B_{R_i,\theta_j}(r_{i,j})\) for every sample.

\begin{equation}
\mathcal{L}(\bm{\theta} \mid \bm{r_i})
= \sqrt[len(\bm{r_i})]{\prod_{j=1}^{len(\bm{r_i})}
B_{R_i,\theta_j}(r_{i,j})}
= \sqrt[len(\bm{r_i})]{\prod_{j=1}^{len(\bm{r_i})} \binom{R_i}{r_{i,j}} \theta_j^{r_{i,j}} \left( 1 - \theta_j \right)^{\left( R_i - r_{i,j} \right)}}
\label{eq:likelihood_binomial}\end{equation}

\(R_i\) is the sum of the abundance vector \(\bm{r_i}\). Because both
\(R_i\) and \(r_{i,j}\) can be real numbers, we need to generalize the
binomial coefficient to positive real numbers via the gamma function
\(\Gamma\).

\begin{equation} \binom{n}{k}
= \frac{\Gamma(n+1)}{\Gamma(k+1) \, \Gamma(n-k+1)}
\label{eq:}\end{equation}

Because the binomial coefficient is a constant factor and independent of
\(\bm \theta\), it can be omitted in ML classification (when comparing
between different genomes) or be retained upon parameter updates. As for
the Poisson, the model accounts for low and zero counts (by the binomial
coefficient). We derived the likelihood function from the joint
distribution over all contig positions by approximating the binomial
data-term with mean coverage values (see Supplementary Methods).

The MLE \(\bm{\hat \theta}\) for the model parameters on training
sequence data corresponds to the amount of read data (base pairs) in
each sample divided by the total number of base pairs in all samples. We
express this as a weighted sum of contig mean coverage values (see
Supplementary Methods).

\begin{equation}
\bm{\hat \theta}
= \dfrac{ \sum\limits_{i=1}^N w_i \, \bm{r_i} }{ \sum\limits_{i=1}^N w_i \, R_i }
\label{eq:mle_binomial}\end{equation}

It is obvious that absolute and relative abundance models are not
independent when the identical input vectors (here
\(\bm{a_i} = \bm{r_i}\)) are used. However, we can instead apply the
Poisson model to the total coverage \(R_i\) (summed over all samples)
because this sum also follows a Poisson distribution. To illustrate the
total abundance, this compares to mixing the samples before sequencing
so that the resolution of individual samples is lost. The binomial, in
contrast, only captures the relative distribution of reads over the
samples (one DF is lost in the ratio transform). This way, we can
combine both absolute and relative abundance submodels in the aggregate
model.

\subsection{Nucleotide composition}\label{nucleotide-composition}

Microbial genomes have a distinct \enquote{genomic fingerprint} (Karlin,
Mrazek \& Campbell,
\protect\hyperlink{ref-KarlinCompositional1997}{1997}) which is
typically determined by means of \(k\)-mers. Each contig has a relative
frequency vector \(\bm{c_i}\) for all possible \(k\)-mers of size \(k\).
The nature of shotgun sequencing demands that each \(k\)-mer is counted
equally to its reverse complement because the orientation of the
sequenced strand is typically unknown. With increasing \(k\), the
feature space grows exponentially and becomes sparse. Thus, it is common
to select \(k\) from 4 to 6 (Teeling et al.,
\protect\hyperlink{ref-TeelingTetra2004}{2004}; McHardy et al.,
\protect\hyperlink{ref-MchardyAccurate2007}{2007}; Kislyuk et al.,
\protect\hyperlink{ref-KislyukUnsupervised2009}{2009}). Here, we simply
use 5-mers (\(len(\bm{c_i})\) = \(\tfrac{4^5}{2}\) = 512) but other
choices can be made.

For its simplicity and effectiveness, we chose a likelihood model
assuming statistical independence of features so that the likelihood
function in Equation~\ref{eq:likelihood_nbayes} becomes a simple product
over observation probabilities (or a linear model when transforming into
a log-likelihood). Though \(k\)-mers are not independent due to their
overlaps and reverse complementarity (Kislyuk et al.,
\protect\hyperlink{ref-KislyukUnsupervised2009}{2009}), the model has
been successfully applied to \(k\)-mers (Wang et al.,
\protect\hyperlink{ref-WangNaive2007}{2007}), and we can replace
\(k\)-mers in our model with better-suited compositional features,
i.e.~using locality-sensitive hashing (Luo et al.,
\protect\hyperlink{ref-LuoLowdensity2016}{2016}). A genome's background
distribution \(\bm{\theta}\) is a vector which holds the probabilities
to observe each \(k\)-mer and the vector \(\bm{c_i}\) does the same for
the \(i\)\textsuperscript{th} contig. The composition likelihood for a
contig is a weighted and normalized product over the background
frequencies.

\begin{equation}
\mathcal{L}(\bm{\theta} \mid \bm{c_i})
= \prod_{j=1}^{len(\bm{c_i})} \theta_j^{c_{i,j}}
\label{eq:likelihood_nbayes}\end{equation}

The genome parameter vector \(\bm{\hat \theta}\) that maximizes the
likelihood on training sequence data can be estimated by a weighted
average of feature counts (Supplementary Methods).

\begin{equation}
\bm{\hat \theta}
= \dfrac{ \sum\limits_{i=1}^{N} w_i \, \bm{c_i} }{ \sum\limits_{i=1}^{N} w_i }
\label{eq:mle_nbayes}\end{equation}

\subsection{Similarity to reference}\label{similarity-to-reference}

We can compare contigs to reference sequences, for instance by local
alignment. Two contigs that align to closely related taxa are more
likely to derive from the same genome than sequences which align to
distant clades. We convert this indirect relationship to explicit
taxonomic features which we can compare without direct consideration of
reference sequences. A taxon is a hierarchy of nested classes which can
be written as a tree path, for example, the species \emph{E. coli} could
be written as {[}Bacteria, Gammaproteobacteria, Enterobacteriaceae,
\emph{E. coli}{]}.

We assume that distinct regions of a contig, such as genes, can be
annotated with different taxa. Each taxon has a corresponding weight
which in our examples is a positive alignment score. The weighted taxa
define a spectrum over the taxonomy for every contig and genome. It is
not necessary that the alignment reference be complete or include the
respective species genome but all spectra must be equally biased. Since
each contig is represented by a hierarchy of \(L\) numeric weights, we
incorporated these features into our multi-layer model. First, each
contig's taxon weights are transformed to a set of sparse feature
vectors \(\bm{t_i} = \{\bm{t_{i,l}} \mid 1 \le l \le L\}\), one for each
taxonomic level, by inheriting and accumulating scores for higher-level
taxa (see Table~\ref{tbl:hnbayes} and Figure~\ref{fig:hnbayes_tree}).

\hypertarget{tbl:hnbayes}{}
\begin{longtable}[]{@{}llrrrr@{}}
\caption[Calculation of feature counts for layered frequency submodel]{\label{tbl:hnbayes}Calculating the contig features \(\bm{t_i}\)
for a simplified taxonomy. There are five original integer alignment
scores for nodes (c), (e), (f), (g) and (h) which are summed up at
higher levels to calculate the feature vectors \(\bm{t_{i,l}}\). The
corresponding tree structure is shown in Figure~\ref{fig:hnbayes_tree}.
}\tabularnewline
\toprule
Node & Taxon & Level \(l\) & Index \(j\) & Score &
\(t_{i,l,j}\)\tabularnewline
\midrule
\endfirsthead
\toprule
Node & Taxon & Level \(l\) & Index \(j\) & Score &
\(t_{i,l,j}\)\tabularnewline
\midrule
\endhead
a & Bacteria & 1 & 1 & 0 & 7\tabularnewline
b & Gammaproteobacteria & 2 & 1 & 0 & 6\tabularnewline
c & Betaproteobacteria & 2 & 2 & 1 & 1\tabularnewline
d & Enterobacteriaceae & 3 & 1 & 0 & 5\tabularnewline
e & Yersiniaceae & 3 & 2 & 1 & 1\tabularnewline
f & \emph{E. vulneris} & 4 & 1 & 1 & 1\tabularnewline
g & \emph{E. coli} & 4 & 2 & 3 & 3\tabularnewline
h & \emph{Yersinia sp.} & 4 & 3 & 1 & 1\tabularnewline
\bottomrule
\end{longtable}

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_mglex/main_tree.pdf}
\caption[Simplified taxonomy]{Taxonomy for Table~\ref{tbl:hnbayes} which is simplified to four levels and eight nodes. A full taxonomy may consist of thousands of nodes.}\label{fig:hnbayes_tree}
\end{figure}

Each vector \(\bm{t_{i,l}}\) contains the scores for all \(T_l\)
possible taxa at level \(l\). A genome is represented by a similar set
of vectors \(\bm{\theta} = \{\bm{\theta_l} \mid 1 \le l \le L \}\) with
identical dimensions, but here, entries represent relative frequencies
on the particular level \(l\), for instance a distribution over all
family taxa. The corresponding likelihood model is a set of
simple frequency models, one for each layer. The full likelihood is a
product of the level likelihoods.

\begin{equation}
\mathcal{L}(\bm{\theta} \mid \bm{t_i})
= \prod_{l=1}^{L} \prod_{j=1}^{T_l} \theta_{l,j}^{t_{i,l,j}}
\label{eq:likelihood_hnbayes}\end{equation}

For simplicity, we assume that layer likelihoods are independent which
is not quite true but effective. The MLE for each \(\bm{\theta_l}\) is
then derived from training sequences similar to the simple frequency
model (Supplementary Methods).

\begin{equation}
\bm{\hat \theta_{l}} = \frac{\sum\limits_{i=1}^N \bm{t_{i,l}}}{\sum\limits_{j=1}^{T_l} \sum\limits_{i=1}^N t_{i,l,j}}
\label{eq:mle_hnbayes}\end{equation}

\subsection{Inference of weight
parameters}\label{inference-of-weight-parameters}

The aggregate likelihood for a contig in
Equation~\ref{eq:likelihood_aggregate} is a weighted product of submodel
likelihoods. The weights in vector \(\bm \alpha\) balance the
contributions, assuming that they need not be equal. When we write the
likelihood in logarithmic form
(Equation~\ref{eq:loglikelihood_aggregate}), we see that each weight
\(\alpha_k\) sets the variance or width of the contigs' submodel
log-likelihood distribution. We want to estimate \(\alpha_k\) in a way
which is not affected by the original submodel variance because the
corresponding normalization exponent is somewhat arbitrary. For example,
we normalized the nucleotide composition likelihood as a single feature
and the abundance likelihoods as a single sample to limit the range of
the likelihood values, because we simply cannot say how much each
feature type counts.

\begin{equation}
l(\mathbf{\Theta} \mid \mathbf{F_i})
= \beta \sum_{k=1}^{M} \alpha_k \, l(\bm{\mathit{\Theta_k}} \mid \bm{F_{i,k}})
\label{eq:loglikelihood_aggregate}\end{equation}

For any modeled genome, each of the \(M\) submodels produces a distinct
log-likelihood distribution of contig data. Based on the origin of the
contigs, which is known for model training, the distribution can be
split into two parts, the actual genome (positive class) and all other
genomes (negative class), as illustrated in
Figure~\ref{fig:alpha_inference}. The positive distribution is roughly
unimodal and close to zero whereas the negative distribution, which
represents many genomes at once, is diverse and yields strongly negative
values. Intuitively, we want to select \(\bm \alpha\) such that the
positive class is well separated from the negative class in the
aggregate log-likelihood function in
Equation~\ref{eq:loglikelihood_aggregate}.

Because \(\bm \alpha\) cannot be determined by likelihood maximization,
the contributions are balanced in a robust way by setting \(\bm \alpha\)
to the inverse standard deviation of the genome (positive class)
log-likelihood distributions. More precisely, we calculate the average
standard deviation over all genomes weighted by the amount of contig
data (bp) for each genome and calculate \(\alpha_k\) as the inverse of
this value. This scales down submodels with a high average variance.
When we normalize the standard deviation of genome log-likelihood
distributions in all submodels before summation, we assume that a high
variance means uncertainty. This form of weight estimation requires that
for at least some of the genomes, a sufficient number of sequences must
be available to estimate the standard deviation. In some instances, it
might be necessary to split long contigs into smaller sequences to
generate a sufficient number of data points for estimation.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_mglex/main_alpha-inference.pdf}
\caption[Submodel weighting using $\alpha_k$]{Procedure for determination of $\alpha_k$ for each submodel. The figure shows a schematic for a single genome and two submodels. The genome's contig log-likelihood distribution (A and B) is scaled to a standard deviation of one (C and D) before adding the term in the aggregate model in Equation~\ref{eq:loglikelihood_aggregate}.}\label{fig:alpha_inference}
\end{figure}

Parameter \(\beta\) in Equation~\ref{eq:loglikelihood_aggregate} is only
relevant for soft classification but not in the context of ML
classification or p-values. It can best be viewed as a sharpening or
smoothing parameter of the bin posterior distribution (the probability
of a genome or bin given the contig). \(\beta\) is estimated by
minimization of the training or test error, as in our simulation.

\subsection{Data simulation}\label{data-simulation}

We simulated reads of a complex microbial community from 400 publicly
available genomes (Supplementary Methods and Supplementary Table 1).
These comprised 295 unique species and 44 species with two or three
strain genomes each to mimic strain heterogeneity. Our aim was to create a difficult
benchmark dataset under controlled settings, minimizing potential biases
introduced by specific software. We sampled abundances from a lognormal
distribution because it has been described as a realistic model (Schloss
\& Handelsman, \protect\hyperlink{ref-SchlossCensus2006}{2006}). We then
simulated a primary community which was then subject to environmental
changes resulting in exponential growth of 25\% of the community members
at growth rates which were chosen uniformly at random between one and
ten whereas the other genome abundances remained unchanged. We applied
this procedure three times to the primary community which resulted in
one primary and three secondary artificial community abundance
profiles. With these, we generated 150 bp long Illumina HiSeq reads
using the ART simulator (Huang et al.,
\protect\hyperlink{ref-HuangArt2012}{2012}) and chose a yield of 15 Gb
per sample. The exact amount of read data for all four samples after
simulation was 59.47 Gb. To avoid any bias caused by specific metagenome
assembly software and to assure a constant contig length, we divided the
original genome sequences into non-overlapping artificial contigs of 1
kb length and selected a random 500 kb of each genome to which we mapped
the simulated reads using Bowtie2 (Langmead \& Salzberg,
\protect\hyperlink{ref-LangmeadFast2012}{2012}). By the exclusion of
some genome reference, we imitated incomplete genome assemblies when
mapping reads, which affects the coverage values. Finally, we subsampled
300 kb contigs per genome with non-zero read coverage in at least one of
the samples to form the demonstration dataset (120 Mb), which has 400
genomes (including related strains), four samples and contigs of size 1
kb. Due to the short contigs and few samples, this is a challenging
dataset for complete genome recovery (Nielsen et al.,
\protect\hyperlink{ref-NielsenIdentification2014}{2014}) but suitable to
demonstrate the functioning of our model with limited data. For each
contig we derived 5-mer frequencies, taxonomic annotation (removing
species-level genomes from the reference sequence data) and average read
coverage per sample, as described in the Supplementary Methods.

\section{Results}\label{results-4}

\subsection{Maximum likelihood
classification}\label{maximum-likelihood-classification}

We evaluated the performance of the model when classifying contigs to
the genome with the highest likelihood, a procedure called Maximum
Likelihood (ML) classification. We applied a form of three-fold
cross-validation, dividing the simulated data set into three
equally-sized parts with 100 kb from every genome. We used only 100 kb
(training data) of every genome to infer the model parameters and the
other 200 kb (test data) to measure the classification error. 100 kb was
used for training because it is often difficult to identify sufficient
training data in metagenome analysis. For each combination of submodels,
we calculated the mean squared error (MSE) and mean pairwise
coclustering (MPC) probability for the predicted (ML) probability
matrices (Suppl. Methods), averaged over the three test data partitions.
We included the MPC as it can easily be interpreted: for instance, a
value of 0.5 indicates that on average 50\% of all contig pairs of a
genome end up in the same bin after classification.
Table~\ref{tbl:classification_consistency} shows that the model
integrates information from each data source such that the inclusion of
additional submodels resulted in a better MPC and also MSE, with a
single exception when combining absolute and relative abundance models
which resulted in a marginal increase of the MSE. We also found that
taxonomic annotation represents the most powerful information type in
our simulation. For comparison, we added scores for NBC (Rosen,
Reichenberger \& Rosenfeld, \protect\hyperlink{ref-RosenNbc2011}{2011}),
a classifier based on nucleotide composition with in-sample training
using 5-mers and 15-mers, and Centrifuge (Kim et al.,
\protect\hyperlink{ref-KimCentrifuge2016}{2016}), a similarity-based
classifier both with in-sample and reference data. These programs were
given the same information as the corresponding submodels and they rank
close to these. In a further step, we investigated how the presence of
very similar genomes impacted the performance of the model. We first
collapsed strains from the same species by merging the corresponding
columns in the classification likelihood matrix, retaining the entry
with the highest likelihood, and then computed the resulting
coclustering performance increase \(\Delta\)MPC\textsubscript{ML}.
Considering assignment on species instead of strain level showed a
larger \(\Delta\)MPC\textsubscript{ML} for nucleotide composition and
taxonomic annotation than for absolute and relative abundance. This is
expected, because both do not distinguish among strains, whereas genome
abundance does in some, but not all cases.

\hypertarget{tbl:classification_consistency}{}
\begin{longtable}[]{@{}lrrr@{}}
\caption[ML classification performance]{\label{tbl:classification_consistency}Cross-validation
performance of ML classification for all possible combinations of
submodels. We calculated the mean pairwise coclustering (MPC), the
strain to species MPC improvement (\(\Delta\)MPC\textsubscript{ML}) and
the mean squared error (MSE). AbAb = absolute total abundance; ReAb =
relative abundance; NuCo = nucleotide composition; TaAn = taxonomic
annotation. NBC (v1.1) and Centrifuge (v.1.0.3b) are external
classifiers added for comparison. Best values are in bold and worst in
italic. }\tabularnewline
\toprule
Submodels & MPC\textsubscript{ML} & \(\Delta\)MPC\textsubscript{ML} &
MSE\textsubscript{ML}\tabularnewline
\midrule
\endfirsthead
\toprule
Submodels & MPC\textsubscript{ML} & \(\Delta\)MPC\textsubscript{ML} &
MSE\textsubscript{ML}\tabularnewline
\midrule
\endhead
\emph{Centrifuge (in-sample)} & \emph{0.01} & +0.01 &
0.51\tabularnewline
\emph{NBC (\(15\)-mers)} & 0.02 & \emph{+0.00} &
\emph{0.66}\tabularnewline
AbAb & 0.03 & \emph{+0.00} & 0.58\tabularnewline
ReAb & 0.08 & +0.02 & 0.61\tabularnewline
\emph{Centrifuge (reference)} & 0.13 & +0.03 & 0.45\tabularnewline
AbAb + ReAb & 0.21 & +0.04 & 0.59\tabularnewline
NuCo & 0.30 & +0.06 & 0.52\tabularnewline
\emph{NBC (\(5\)-mers)} & 0.34 & +0.06 & 0.48\tabularnewline
ReAb + NuCo & 0.41 & +0.07 & 0.48\tabularnewline
AbAb + NuCo & 0.43 & +0.08 & 0.50\tabularnewline
TaAn & 0.46 & +0.09 & 0.41\tabularnewline
AbAb + ReAb + NuCo & 0.52 & +0.09 & 0.44\tabularnewline
NuCo + TaAn & 0.52 & +0.09 & 0.40\tabularnewline
AbAb + TaAn & 0.54 & +0.09 & 0.39\tabularnewline
AbAb + NuCo + TaAn & 0.60 & +0.10 & 0.37\tabularnewline
ReAb + TaAn & 0.60 & +0.10 & 0.36\tabularnewline
ReAb + NuCo + TaAn & 0.64 & \textbf{+0.11} & 0.34\tabularnewline
AbAb + ReAb + TaAn & 0.65 & +0.10 & 0.35\tabularnewline
AbAb + ReAb + NuCo + TaAn & \textbf{0.68} & \textbf{+0.11} &
\textbf{0.33}\tabularnewline
\bottomrule
\end{longtable}

\subsection{Soft assignment}\label{soft-assignment}

The contig length of 1 kb in our simulation is considerably shorter, and
therefore harder to classify, than sequences which can be produced by
current assembly methods or by some cutting-edge sequencing platforms
(Goodwin, McPherson \& McCombie,
\protect\hyperlink{ref-GoodwinComing2016}{2016}). In practice, longer
contigs can be classified with higher accuracy than short ones, as more
information is provided as a basis for assignment. For instance, a more
robust coverage mean, a \(k\)-mer spectrum derived from more counts or
more local alignments to reference genomes can be inferred from longer
sequences. However, as short contigs remain frequent in current
metagenome assemblies, 1 kb is sometimes considered a minimum useful
contig length (Alneberg et al.,
\protect\hyperlink{ref-AlnebergBinning2014}{2014}). To account for the
natural uncertainty when assigning short contigs, one can calculate the
posterior probabilities over the genomes (see Suppl. Methods), which
results in partial assignments of each contig to the genomes. This can
reflect situations in which a particular contig is associated with
multiple genomes, for instance in case of misassemblies or the presence
of homologous regions across genomes.

The free model parameter \(\beta\) in
Equation~\ref{eq:likelihood_aggregate}, which is identical in all genome
models, smoothens or sharpens the posterior distribution: \(\beta = 0\)
produces a uniform posterior and with very high \(\beta\), the posterior
approaches the sharp ML solution. We determined \(\beta\) by optimizing
the MSE on both training and test data, shown in
Figure~\ref{fig:beta_fitting}. As expected, the classification training
error was smaller than the test error because the submodel parameters
were optimized with respect to the training data. Because the minima are
close to each other, the full aggregate model seems robust to
overfitting of \(\beta\) on training data. The comparison of soft
vs.~hard assignment shows that the former has a smaller average test
classification MSE of \(\sim\) 0.28 (the illustrated minimum in
Figure~\ref{fig:beta_fitting}) compared to the latter (ML) assignment
MSE of \(\sim\) 0.33 in Table~\ref{tbl:classification_consistency}.
Thus, soft assignment seems more suitable to classify 1 kb contigs,
which tend to produce similar likelihoods under more than one genome
model.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_mglex/main_beta-fitting.pdf}
\caption[Training and test error as a function of $\beta$]{Model training (err) and test error (Err) as a function of $\beta$ for the complete aggregate model including all submodels and feature types. The solid curve shows the average and the colored shading the standard deviation of the three partitions in cross-validation. The corresponding optimal values for $\beta$ are marked by black dots and vertical lines. The minimum average training error is 0.238 ($\beta=2.85$) and test error is 0.279 at $\beta=1.65$.}\label{fig:beta_fitting}
\end{figure}

\subsection{Genome enrichment}\label{genome-enrichment}

Enrichment is commonly known as an experimental technique to increase
the concentration of a target substance relative to others in a sample.
Thus, an enriched metagenome still contains a mixture of different
genomes, but the target genome will be present at much higher frequency
than before. This allows a more focused analysis of the contigs or an
application of methods which seem prohibitive for the full data by
runtime or memory considerations. In the following, we demonstrate how
to filter metagenome contigs by p-value to enrich \emph{in-silico} for
specific genomes. Often, classifiers model an exhaustive list of
alternative genomes but in practice it is difficult to recognize all
species or strains in a metagenome with appropriate training data. When
we only look at individual likelihoods, for instance the maximum among
the genomes, this can be misleading if the contig comes from a missing
genome. For better judgment, a p-value tells us how frequent or extreme
the actual likelihood is for each genome. Many if not all binning
methods lack explicit significance calculations. We can take advantage
of the fact that the classification model compresses all features into a
genome likelihood and generate a null (log-)likelihood distribution on
training data for each genome. Therefore, we can associate empirical
p-values with each newly classified contig and can, for sufficiently
small p-values, reject the null hypothesis that the contig belongs to
the respective genome. Since this is a form of binary classification,
there is the risk to reject a good contig which we measure as
sensitivity.

We enriched a metagenome by first training a genome model and then
calculating the p-values of remaining contigs using this model. Contigs
with lower p-values than the chosen critical value were discarded. The
higher this cutoff is, the smaller the enriched sample becomes, but also
the target genome will be less complete. We calculated the reduced
sample size as a function of the p-value cutoff for our simulation
(Figure~\ref{fig:genome_enrichment}). Selecting a p-value threshold of
2.5\% shrinks the test data on average down to 5\% of the original size.
Instead of an empirical p-value, we could also use a parametrized
distribution or select a critical log-likelihood value by manual
inspection of the log-likelihood distribution (see
Figure~\ref{fig:alpha_inference} for an example of such a distribution).
This example shows that generally a large part of a metagenome dataset
can be discarded while retaining most of the target genome sequence
data.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_mglex/main_genome-enrichment.pdf}
\caption[Metagenome sample enrichment]{Genome enrichment for 400 genomes with three-fold cross-validation. For each genome, we measured the test sample size relative to the full dataset after filtering by a p-value cutoff and summing over the three data partitions. The solid line shows the resulting average sample size over all 400 genomes. The variability between genomes is shown as quantiles in red. Both axes are logarithmic to show the relevant details for lower p-value cutoffs. The corresponding sensitivity, shown in Suppl. Figure 1, is approximately a linear function of the p-value.}\label{fig:genome_enrichment}
\end{figure}

\subsection{Bin analysis}\label{bin-analysis}

The model can be used to analyze bins of metagenome contigs, regardless
of the method that was used to infer these bins. Specifically, one can
measure the similarity of two bins in terms of the contig likelihood
instead of, for instance, an average Euclidean distance based on the
contig or genome \(k\)-mer and abundance vectors. We compare bins to
investigate the relation between the given data, represented by the
features in the model, and their grouping into genome bins. For
instance, one could ask whether the creation of two genome bins is
sufficiently backed up by the contig data or whether they should be
merged into a single bin. For readability, we write the likelihood of a
contig in bin A as: \[
L(\theta_A \mid \text{contig}\ i) = L_i(\theta_A) = L(\theta_A) = L_A
\]

To compare two specific bins, we select the corresponding pair of
columns in the classification likelihood matrix and calculate two
mixture likelihoods for each contig (rows), \(\hat L\), using the MLE of
the parameters for both bins and \(L_{swap}\) under the hypothesis that
we swap the model parameters of both bins. The partial assignment
weights \(\hat \pi_A\) and \(\hat \pi_B\), called responsibilities, are
estimated by normalization of the two bin likelihoods.

\begin{equation}
\hat L
= \hat \pi_A \, L_A + \hat \pi_B \, L_B
= \left(\tfrac{L_A}{L_A + L_B}\right) L_A + \left(\tfrac{L_B}{L_A + L_B}\right) L_B
= \frac{L_A^2 + L_B^2}{L_A + L_B}
\label{eq:mixture_likelihood_opt}\end{equation}

\begin{equation}
L_{swap}
= \hat \pi_A \, L_B + \hat \pi_B \, L_A
= \left(\tfrac{L_A}{L_A + L_B}\right) L_B + \left(\tfrac{L_B}{L_A + L_B}\right) L_A
= \frac{2 L_A L_B}{L_A + L_B}
\label{eq:mixture_likelihood_swap}\end{equation}

For example, if \(\hat \pi_A\) and \(\hat \pi_B\) assign one third of a
contig to the first, less likely bin and two thirds to the second, more
likely bin using the optimal parameters, then \(L_{swap}\) would simply
exchange the contributions in the mixture likelihood so that one third
are assigned to the more likely and two thirds to the less likely bin.
The ratio \(L_{swap} / \hat L\) ranges from zero to one and can be seen
as a percentage similarity. We form a joint relative likelihood for all
\(N\) contigs, weighting each contig by its optimal mixture likelihood
\(\hat L\) and normalizing over these likelihood values.

\begin{equation}
\text{S}(A,B)
= \sqrt[Z]{\prod\limits_{i=1}^N \left( \frac{2 \, L_i(\theta_A) \, L_i(\theta_B)}{L_i^2(\theta_A ) + L_i^2(\theta_B)} \right)^{\tfrac{L_i^2(\theta_A) + L_i^2(\theta_B)}{L_i(\theta_A) + L_i(\theta_B)} }}
\label{eq:mixture_likelihood_similarity}\end{equation}

normalized by the total joint mixture likelihood

\begin{equation}
Z = \sum_{i=1}^N \frac{L_i^2(\theta_A) + L_i^2(\theta_B)}{L_i(\theta_A) + L_i(\theta_B)}
\label{eq:mixture_likelihood_similarity_constant}\end{equation}

The quantity in Equation~\ref{eq:mixture_likelihood_similarity} ranges
from zero to one, reaching one when the two bin models produce identical
likelihood values. We can therefore interpret the ratio as a percentage
similarity between any two bins. A connection to the Kullback-Leibler
divergence can be constructed (Supplementary Methods).

To demonstrate the application, we trained the model on our simulated
genomes, assuming they were bins, and created trees
(Figure~\ref{fig:tree_bin_comparison}) for a randomly drawn subset of 50
of the 400 genomes using the probabilistic bin distances \(-\log(S)\)
(Equation~\ref{eq:mixture_likelihood_similarity}). We computed the
distances twice, first with only nucleotide composition and taxonomic
annotation submodels and second with the full feature set to compare the
bin resolution. The submodel parameters were inferred using the full
dataset and \(\beta\) using three-fold cross-validation. We then applied
average linkage clustering to build balanced and rooted trees with equal
distance from leaf to root for visual inspection. The first tree
loosely reflects phylogenetic structure corresponding to the input
features. However, many similarities over 50\% (outermost ring) show
that model and data lack the support for separating these bins. In
contrast, the fully informed tree, which additionally includes
information about contig coverages, separates the genome bins, such
that only closely related strains remain ambiguous. This analysis shows
again that the use of additional features improves the resolution of
individual genomes and, specifically, that abundance separates similar
genomes. Most importantly, we show that our model provides a measure of
support for a genome binning. We know the taxa of the genome bins in
this example but for real metagenomes, such an analysis can reveal
binning problems and help to refine the bins as in
Figure~\ref{fig:binning_workflow}d.

\begin{figure}[htbp]
\centering
\includegraphics{figure/publication_mglex/main_bin-similarity.pdf}
\caption[Average linkage clustering of genomes using probabilistic distances]{Average linkage clustering of a random subset of 50 out of 400 genomes using probabilistic distances $-\log(S)$ to demonstrate the ability of the model to measure bin resolution. This example compares the left (blue) tree, which was constructed only with nucleotide composition and taxonomic annotations, with the right (red) tree, which uses all available features. The tip labels were shortened to fit into the figure. The similarity axis is scaled as $\log(1-\log(S))$ to focus on values near one. Bins which are more than 50\% similar branch in the outermost ring whereas highly dissimilar bins branch close to the center. We created the trees by applying the R function \texttt{hclust(method="average")} to MGLEX output.}\label{fig:tree_bin_comparison}
\end{figure}

\subsection{Genome bin refinement}\label{genome-bin-refinement}

We applied the model to show one of its current use cases on more
realistic data. We downloaded the medium complexity dataset from
\url{https://www.cami-challenge.org}. This dataset is quite complex (232 genomes, two
sample replicates). We also retrieved the results of the two
highest-performing automatic binning programs, MaxBin and Metawatt, in
the CAMI challenge evaluation (Sczyrba et al.,
\protect\hyperlink{ref-SczyrbaCritical2017}{2017}). We took the simplest
possible approach: we trained MGLEX on the genome bins derived by these
methods and classified the contigs to the bins with the highest
likelihood, thus ignoring all details of contig splitting, \(\beta\) or
p-value calculation and changes in the number of genome bins. When
contigs were assigned to multiple bins with equal probability, we
attributed them to the first bin in the list because the evaluation
framework does not allow sharing contigs between bins. We only used
information provided to the contestants by the time of the challenge in
the process. We report the results for two settings for each method
using the recall, the fraction of overall assigned contigs (bp), and the
Adjusted Rand index (ARI) as defined in the CAMI evaluation paper. In
the first, we swapped contigs which were originally assigned between
bins. In the second, all available contigs were assigned to the bins,
thus maximizing the recall. Table~\ref{tbl:genome_refinement} shows that
MGLEX bin refinement improved the genome bins in terms of the ARI for
both sets of genome bins and increased the recall for Metawatt but not
MaxBin. This is likely due to the fact that MaxBin has fewer but
relatively complete bins to which the other contigs cannot correctly be
recruited. Further improvement would involve dissection and merging of
bins within and among methods, for which MGLEX likelihoods can be
considered.

\hypertarget{tbl:genome_refinement}{}
\begin{longtable}[]{@{}llrrr@{}}
\caption[Genome bin refinement scores]{\label{tbl:genome_refinement}Genome bin refinement for CAMI
medium complexity dataset with 232 genomes and two samples. The recall
is the fraction of overall assigned contigs (bp). The Adjusted Rand
index (ARI) is a measure of binning precision. The unmodified genome
bins are the submissions to the
\href{https://www.cami-challenge.org}{CAMI challenge} using the
corresponding unsupervised binning methods Metawatt and MaxBin. MGLEX
swapped contigs: contigs in original genome bins reassigned to the bin
with highest MGLEX likelihood. MGLEX all contigs: all contigs (including
originally unassigned ones) assigned to the bin with highest MGLEX
likelihood. The lowest scores are written in italic and highest in bold.
}\tabularnewline
\toprule
Binner & Variant & Bin count & Recall (bp) & ARI\tabularnewline
\midrule
\endfirsthead
\toprule
Binner & Variant & Bin count & Recall (bp) & ARI\tabularnewline
\midrule
\endhead
Metawatt & unmodified & 285 & \emph{0.94} & \emph{0.75}\tabularnewline
Metawatt & MGLEX swapped contigs & 285 & \emph{0.94} &
\textbf{0.82}\tabularnewline
Metawatt & MGLEX all contigs & 285 & \textbf{1.00} & 0.77\tabularnewline
MaxBin & unmodified & 125 & \emph{0.82} & 0.90\tabularnewline
MaxBin & MGLEX swapped contigs & 125 & \emph{0.82} &
\textbf{0.92}\tabularnewline
MaxBin & MGLEX all contigs & 125 & \textbf{1.00} &
\emph{0.76}\tabularnewline
\bottomrule
\end{longtable}

\subsection{Implementation}\label{implementation}

We provide a Python package called MGLEX, which includes the described
model. Simple text input facilitates the integration of external
programs for feature extraction like \(k\)-mer counting or read mapping,
which are not included. MGLEX can process millions of sequences with
vectorized arithmetics using NumPy (Walt, Colbert \& Varoquaux,
\protect\hyperlink{ref-WaltNumpy2011}{2011}) and includes a command line
interface to the main functionality, such as model training,
classification, p-value and error calculations. It is open source
(GPLv3) and freely available via the
\href{https://pypi.python.org/pypi/mglex/}{Python Package
Index}\footnote{\url{https://pypi.python.org/pypi/mglex/}} and on
\href{https://www.github.com/hzi-bifo/mglex/}{GitHub}\footnote{\url{https://www.github.com/hzi-bifo/mglex/}}.

\section{Discussion}\label{discussion-3}

We describe an aggregate likelihood model for the reconstruction of
genome bins from metagenome data sets and show its value for several
applications. The model can learn from and classify nucleotide sequences
from metagenomes. It provides likelihoods and posterior bin
probabilities for existing genome bins, as well as p-values, which can
be used to enrich a metagenome dataset with a target genome. The model
can also be used to quantify bin similarity. It builds on four different
submodels that make use of different information sources in
metagenomics, namely absolute and relative contig coverage, nucleotide composition and
previous taxonomic assignments. By its modular design, the model can
easily be extended to include additional information sources. This
modularity also helps in interpretation and computation: the former
because different features can be analyzed separately, and the latter
because submodels can be trained independently and in parallel.

In comparison to previously described parametric binning methods, our
model incorporates two new types of features. The first is relative
differential coverage, for which, to our knowledge, this is the first
attempt to use binomials to account for systematic bias in the read
mapping for different genome regions. As such, the binomial submodel
represents the parametric equivalent of covariance distance clustering.
The second new type is taxonomic annotation, which substantially
improved the classification results in our simulation. Taxonomic
annotations, as used in the model and in our simulation, were not
correct up to the species level and need not be, as seen in the
classification results. We only require the same annotation method be
applied to all sequences. In comparison to previous methods, our
aggregate model has weight parameters to combine the different feature
types and allows tuning the bin posterior distribution by selection of
an optimal smoothing parameter \(\beta\).

We showed that probabilistic models represent a good choice to handle
metagenomes with short contigs or few sample replicates, because they
make soft, not hard decisions, and because they can be applied in
numerous ways. When the individual submodels are trained, genome bin
properties are compressed into fewer model parameters, such as mean
values, which are mostly robust to outliers and therefore tolerate a
certain fraction of bin pollution. This property allows the reassignment of
contigs to bins, which we demonstrated in the \enquote{Genome bin
refinement} section. Measuring the performance of the individual
submodels and their corresponding features on short simulated contigs
(Table~\ref{tbl:classification_consistency}), we find that they
discriminate genomes or species pan-genomes by varying degrees. Genome
abundance represents, in our simulation with four samples, the weakest
single feature type, which will likely become more powerful with
increasing sample numbers. Notably, genomes of individual strains are
more difficult to distinguish than species-level pan-genomes using any of
the features. In practice, if not using idealized assemblies as in our
current evaluation, strain resolution poses a problem to metagenome
assembly, which is currently not resolved in a satisfactory manner
(Sczyrba et al., \protect\hyperlink{ref-SczyrbaCritical2017}{2017}).

The current MGLEX model is somewhat crude because it makes many
simplifying assumptions in the submodel definitions. For instance, the
multi-layer model for taxonomic annotation assumes that the
probabilities in different layers are independent, the series of
binomials for relative abundance should be replaced by a multinomial to
account for the parameter dependencies or the absolute abundance Poisson
model should incorporate overdispersion to model the data more
appropriately. Addressing these limitations can lead to further
gains in performance while the overall framework and usage of
MGLEX stays unchanged. When we devised our model, we had an embedding
into more complex routines in mind. In the future, the model can be used
in inference procedures such as EM or MCMC to infer or improve an
existing genome binning. Thus, MGLEX provides a software package for use
in other programs. However, it also represents a powerful stand-alone
tool for the adept user in its current form.

Currently, MGLEX does not yet have support for multiple processors and
only provides the basic functionality presented here. However, training
and classification can easily be implemented in parallel because they
are expressed as matrix multiplications. The model requires sufficient
training data to robustly estimate the submodel weights \(\bm \alpha\)
using the standard deviation of the empirical log-likelihood
distributions and requires linked sequences to estimate \(\beta\) using
error minimization. In situations with a limited number of contigs per
genome bin, we therefore advise generating linked training sequences of
a certain length, as in our simulation, for instance by splitting
assembled contigs. The optimal length for splitting may depend on the
overall fragmentation of the metagenome.

Our open-source Python package MGLEX provides a flexible framework for
metagenome analysis and binning which we intend to develop further
together with the metagenomics research community. It can be used as a
library to write new binning applications or to implement custom
workflows, for example to supplement existing binning strategies. It can
build upon a present metagenome binning by taking assignments to bins as
input and deriving likelihoods and p-values that allow for critical
inspection of the contig assignments. Based on the likelihood, MGLEX can
calculate bin similarities to provide insight into the structure of data
and community. Finally, genome enrichment of metagenomes can improve the
recovery of particular genomes in large datasets.

\section{Acknowledgments}\label{acknowledgments-1}

We thank S. Reimering, A. Weimann and A. Bremges for proofreading and
constructive feedback.

\chapter{References}\label{references}

\setlength{\parindent}{0cm} \everypar{\hangindent=.5cm}

\hypertarget{refs}{}
\hypertarget{ref-AguiarpulidoMetagenomics2016}{}
\textbf{Aguiar-Pulido V., Huang W., Suarez-Ulloa V., Cickovski T.,
Mathee K., Narasimhan G.} \textbf{2016}. Metagenomics,
Metatranscriptomics, and Metabolomics Approaches for Microbiome
Analysis. \emph{Evolutionary Bioinformatics Online} 12:5--16. DOI:
\href{https://doi.org/10.4137/EBO.S36436}{10.4137/EBO.S36436}.

\hypertarget{ref-AlbertsenGenome2013}{}
\textbf{Albertsen M., Hugenholtz P., Skarshewski A., Nielsen KL.,
Tyson GW., Nielsen PH.} \textbf{2013}. Genome sequences of rare,
uncultured bacteria obtained by differential coverage binning of
multiple metagenomes. \emph{Nature biotechnology} 31:533--8. DOI:
\href{https://doi.org/10.1038/nbt.2579}{10.1038/nbt.2579}.

\hypertarget{ref-AlnebergBinning2014}{}
\textbf{Alneberg J., Bjarnason BS., de Bruijn I., Schirmer M., Quick J.,
Ijaz UZ., Lahti L., Loman NJ., Andersson AF., Quince C.} \textbf{2014}.
Binning metagenomic contigs by coverage and composition. \emph{Nature
Methods} 11:1144--1146. DOI:
\href{https://doi.org/10.1038/nmeth.3103}{10.1038/nmeth.3103}.

\hypertarget{ref-BaranJoint2012}{}
\textbf{Baran Y., Halperin E.} \textbf{2012}. Joint analysis of multiple
metagenomic samples. \emph{PLoS computational biology} 8:e1002373. DOI:
\href{https://doi.org/10.1371/journal.pcbi.1002373}{10.1371/journal.pcbi.1002373}.

\hypertarget{ref-BergerPerformance2011}{}
\textbf{Berger SA., Krompass D., Stamatakis A.} \textbf{2011}.
Performance, accuracy, and web server for evolutionary placement of
short sequence reads under maximum likelihood. \emph{Systematic biology}
60:291--302. DOI:
\href{https://doi.org/10.1093/sysbio/syr010}{10.1093/sysbio/syr010}.

\hypertarget{ref-BerryDeciphering2014}{}
\textbf{Berry D., Widder S.} \textbf{2014}. Deciphering microbial
interactions and detecting keystone species with co-occurrence networks.
\emph{Frontiers in Microbiology} 5. DOI:
\href{https://doi.org/10.3389/fmicb.2014.00219}{10.3389/fmicb.2014.00219}.

\hypertarget{ref-BradyPhymm2009}{}
\textbf{Brady A., Salzberg SL.} \textbf{2009}. Phymm and PhymmBL:
Metagenomic phylogenetic classification with interpolated Markov models.
\emph{Nature methods} 6:673--6. DOI:
\href{https://doi.org/10.1038/nmeth.1358}{10.1038/nmeth.1358}.

\hypertarget{ref-BradyPhymmbl2011}{}
\textbf{Brady A., Salzberg S.} \textbf{2011}. PhymmBL expanded:
Confidence scores, custom databases, parallelization and more.
\emph{Nature methods} 8:367. DOI:
\href{https://doi.org/10.1038/nmeth0511-367}{10.1038/nmeth0511-367}.

\hypertarget{ref-BremgesMecors2016}{}
\textbf{Bremges A., Singer E., Woyke T., Sczyrba A.} \textbf{2016}.
MeCorS: Metagenome-enabled error correction of single cell sequencing
reads. \emph{Bioinformatics} 32:2199--2201. DOI:
\href{https://doi.org/10.1093/bioinformatics/btw144}{10.1093/bioinformatics/btw144}.

\hypertarget{ref-BuchfinkFast2014}{}
\textbf{Buchfink B., Xie C., Huson DH.} \textbf{2014}. Fast and
sensitive protein alignment using DIAMOND. \emph{Nature Methods}
12:59--60. DOI:
\href{https://doi.org/10.1038/nmeth.3176}{10.1038/nmeth.3176}.

\hypertarget{ref-BulgarelliStructure2015}{}
\textbf{Bulgarelli D., Garrido-Oter R., Münch PC., Weiman A., Dröge J.,
Pan Y., McHardy AC., Schulze-Lefert P.} \textbf{2015}. Structure and
Function of the Bacterial Root Microbiota in Wild and Domesticated
Barley. \emph{Cell Host \& Microbe} 17:392--403. DOI:
\href{https://doi.org/10.1016/j.chom.2015.01.011}{10.1016/j.chom.2015.01.011}.

\hypertarget{ref-CamachoBlast2009}{}
\textbf{Camacho C., Coulouris G., Avagyan V., Ma N., Papadopoulos J.,
Bealer K., Madden TL.} \textbf{2009}. BLAST+: Architecture and
applications. \emph{BMC bioinformatics} 10:421. DOI:
\href{https://doi.org/10.1186/1471-2105-10-421}{10.1186/1471-2105-10-421}.

\hypertarget{ref-CarrReconstructing2013}{}
\textbf{Carr R., Shen-Orr SS., Borenstein E.} \textbf{2013}.
Reconstructing the genomic content of microbiome taxa through shotgun
metagenomic deconvolution. \emph{PLoS computational biology} 9:e1003292.
DOI:
\href{https://doi.org/10.1371/journal.pcbi.1003292}{10.1371/journal.pcbi.1003292}.

\hypertarget{ref-ChatterjiCompostbin2008}{}
\textbf{Chatterji S., Yamazaki I., Bai Z., Eisen JA.} \textbf{2008}.
CompostBin: A DNA composition-based algorithm for binning environmental
shotgun reads. In: \emph{Annual International Conference on Research in
Computational Molecular Biology}. Springer, 17--28.

\hypertarget{ref-CuvelierTargeted2010}{}
\textbf{Cuvelier ML., Allen AE., Monier A., McCrow JP., Messié M.,
Tringe SG., Woyke T., Welsh RM., Ishoey T., Lee J-H., Binder BJ., DuPont
CL., Latasa M., Guigand C., Buck KR., Hilton J., Thiagarajan M., Caler
E., Read B., Lasken RS., Chavez FP., Worden AZ.} \textbf{2010}. Targeted
metagenomics and ecology of globally important uncultured eukaryotic
phytoplankton. \emph{Proceedings of the National Academy of Sciences}
107:14679--14684. DOI:
\href{https://doi.org/10.1073/pnas.1001665107}{10.1073/pnas.1001665107}.

\hypertarget{ref-DarlingPhylosift2014}{}
\textbf{Darling AE., Jospin G., Lowe E., Matsen FA., Bik HM., Eisen
JA.} \textbf{2014}. PhyloSift: Phylogenetic analysis of genomes and
metagenomes. \emph{PeerJ} 2:e243. DOI:
\href{https://doi.org/10.7717/peerj.243}{10.7717/peerj.243}.

\hypertarget{ref-DohmSubstantial2008}{}
\textbf{Dohm JC., Lottaz C., Borodina T., Himmelbauer H.} \textbf{2008}.
Substantial biases in ultra-short read data sets from high-throughput
DNA sequencing. \emph{Nucleic Acids Research} 36:e105. DOI:
\href{https://doi.org/10.1093/nar/gkn425}{10.1093/nar/gkn425}.

\hypertarget{ref-DongReconstructing2017}{}
\textbf{Dong X., Dröge J., von Toerne C., Marozava S., McHardy AC.,
Meckenstock RU.} \textbf{2017}. Reconstructing metabolic pathways of a
member of the genus Pelotomaculum suggesting its potential to oxidize
benzene to carbon dioxide with direct reduction of sulfate. \emph{FEMS
Microbiology Ecology} 93. DOI:
\href{https://doi.org/10.1093/femsec/fiw254}{10.1093/femsec/fiw254}.

\hypertarget{ref-DrogeTaxonomic2012}{}
\textbf{Dröge J., McHardy AC.} \textbf{2012}. Taxonomic binning of
metagenome samples generated by next-generation sequencing technologies.
\emph{Briefings in Bioinformatics} 13:646--655. DOI:
\href{https://doi.org/10.1093/bib/bbs031}{10.1093/bib/bbs031}.

\hypertarget{ref-DrogeTaxatortk2014}{}
\textbf{Dröge J., Gregor I., McHardy AC.} \textbf{2014}. Taxator-tk:
Precise taxonomic assignment of metagenomes by fast approximation of
evolutionary neighborhoods. \emph{Bioinformatics (Oxford,
England)}:1--8. DOI:
\href{https://doi.org/10.1093/bioinformatics/btu745}{10.1093/bioinformatics/btu745}.

\hypertarget{ref-DrogeProbabilistic2017}{}
\textbf{Dröge J., Schönhuth A., McHardy AC.} \textbf{2017}. A
probabilistic model to recover individual genomes from metagenomes.
\emph{PeerJ Computer Science} 3:e117. DOI:
\href{https://doi.org/10.7717/peerj-cs.117}{10.7717/peerj-cs.117}.

\hypertarget{ref-EloefadroshMetagenomics2016}{}
\textbf{Eloe-Fadrosh EA., Ivanova NN., Woyke T., Kyrpides NC.}
\textbf{2016}. Metagenomics uncovers gaps in amplicon-based detection of
microbial diversity. \emph{Nature Microbiology} 1:15032. DOI:
\href{https://doi.org/10.1038/nmicrobiol.2015.32}{10.1038/nmicrobiol.2015.32}.

\hypertarget{ref-ErenAnvi2015}{}
\textbf{Eren AM., Esen ÖC., Quince C., Vineis JH., Morrison HG., Sogin
ML., Delmont TO.} \textbf{2015}. Anvi'o: An advanced analysis and
visualization platform for 'omics data. \emph{PeerJ} 3:e1319. DOI:
\href{https://doi.org/10.7717/peerj.1319}{10.7717/peerj.1319}.

\hypertarget{ref-FrithParameters2010}{}
\textbf{Frith MC., Hamada M., Horton P.} \textbf{2010}. Parameters for
accurate genome alignment. \emph{BMC bioinformatics} 11:80. DOI:
\href{https://doi.org/10.1186/1471-2105-11-80}{10.1186/1471-2105-11-80}.

\hypertarget{ref-FuhrmanMarine2015}{}
\textbf{Fuhrman JA., Cram JA., Needham DM.} \textbf{2015}. Marine
microbial community dynamics and their ecological interpretation.
\emph{Nature Reviews Microbiology} 13:133--146. DOI:
\href{https://doi.org/10.1038/nrmicro3417}{10.1038/nrmicro3417}.

\hypertarget{ref-GarrettMetagenomic2010}{}
\textbf{Garrett RA., Prangishvili D., Shah SA., Reuter M., Stetter KO.,
Peng X.} \textbf{2010}. Metagenomic analyses of novel viruses and
plasmids from a cultured environmental sample of hyperthermophilic
neutrophiles. \emph{Environmental Microbiology} 12:2918--2930. DOI:
\href{https://doi.org/10.1111/j.1462-2920.2010.02266.x}{10.1111/j.1462-2920.2010.02266.x}.

\hypertarget{ref-GawadSinglecell2016}{}
\textbf{Gawad C., Koh W., Quake SR.} \textbf{2016}. Single-cell genome
sequencing: Current state of the science. \emph{Nature Reviews Genetics}
17:175--188. DOI:
\href{https://doi.org/10.1038/nrg.2015.16}{10.1038/nrg.2015.16}.

\hypertarget{ref-GerlachTaxonomic2011}{}
\textbf{Gerlach W., Stoye J.} \textbf{2011}. Taxonomic classification of
metagenomic shotgun sequences with CARMA3. \emph{Nucleic acids
research}:1--11. DOI:
\href{https://doi.org/10.1093/nar/gkr225}{10.1093/nar/gkr225}.

\hypertarget{ref-GhuryeMetagenomic2016}{}
\textbf{Ghurye JS., Cepeda-Espinoza V., Pop M.} \textbf{2016}.
Metagenomic Assembly: Overview, Challenges and Applications. \emph{The
Yale Journal of Biology and Medicine} 89:353--362.

\hypertarget{ref-GillespieIsolation2002}{}
\textbf{Gillespie DE., Brady SF., Bettermann AD., Cianciotto NP., Liles
MR., Rondon MR., Clardy J., Goodman RM., Handelsman J.} \textbf{2002}.
Isolation of Antibiotics Turbomycin A and B from a Metagenomic Library
of Soil Microbial DNA. \emph{Applied and Environmental Microbiology}
68:4301--4306. DOI:
\href{https://doi.org/10.1128/AEM.68.9.4301-4306.2002}{10.1128/AEM.68.9.4301-4306.2002}.

\hypertarget{ref-GoodwinComing2016}{}
\textbf{Goodwin S., McPherson JD., McCombie WR.} \textbf{2016}. Coming
of age: Ten years of next-generation sequencing technologies.
\emph{Nature Reviews Genetics} 17:333--351. DOI:
\href{https://doi.org/10.1038/nrg.2016.49}{10.1038/nrg.2016.49}.

\hypertarget{ref-GregorPhylopythias2014}{}
\textbf{Gregor I., Dröge J., Schirmer M., Quince C., McHardy AC.}
\textbf{2014}. PhyloPythiaS+: A self-training method for the rapid
reconstruction of low-ranking taxonomic bins from metagenomes.
\emph{arxiv.org}:1--67.

\hypertarget{ref-GregorPhylopythias2016}{}
\textbf{Gregor I., Dröge J., Schirmer M., Quince C., McHardy AC.}
\textbf{2016}. \emph{PhyloPythiaS}\emph{+}: A self-training method for
the rapid reconstruction of low-ranking taxonomic bins from metagenomes.
\emph{PeerJ} 4:e1603. DOI:
\href{https://doi.org/10.7717/peerj.1603}{10.7717/peerj.1603}.

\hypertarget{ref-HagenQuantitative2016}{}
\textbf{Hagen LH., Frank JA., Zamanzadeh M., Eijsink VGH., Pope PB.,
Horn SJ., Arntzen MØ.} \textbf{2016}. Quantitative metaproteomics
highlight the metabolic contributions of uncultured phylotypes in a
thermophilic anaerobic digester. \emph{Applied and Environmental
Microbiology}:AEM.01955--16. DOI:
\href{https://doi.org/10.1128/AEM.01955-16}{10.1128/AEM.01955-16}.

\hypertarget{ref-HamadyMicrobial2009}{}
\textbf{Hamady M., Knight R.} \textbf{2009}. Microbial community
profiling for human microbiome projects: Tools, techniques, and
challenges. \emph{Genome research} 19:1141--52. DOI:
\href{https://doi.org/10.1101/gr.085464.108}{10.1101/gr.085464.108}.

\hypertarget{ref-HandelsmanMetagenomics2004}{}
\textbf{Handelsman J.} \textbf{2004}. Metagenomics: Application of
genomics to uncultured microorganisms. \emph{Microbiology and molecular
biology reviews : MMBR} 68:669--85. DOI:
\href{https://doi.org/10.1128/MMBR.68.4.669-685.2004}{10.1128/MMBR.68.4.669-685.2004}.

\hypertarget{ref-HastieElements2001}{}
\textbf{Hastie T., Tibshirani R., Friedman J.} \textbf{2001}. \emph{The
Elements of Statistical Learning}. Springer New York Inc.

\hypertarget{ref-HauswedellLambda2014}{}
\textbf{Hauswedell H., Singer J., Reinert K.} \textbf{2014}. Lambda: The
local aligner for massive biological data. \emph{Bioinformatics}
30:i349--i355. DOI:
\href{https://doi.org/10.1093/bioinformatics/btu439}{10.1093/bioinformatics/btu439}.

\hypertarget{ref-HessMetagenomic2011}{}
\textbf{Hess M., Sczyrba A., Egan R., Kim T-W., Chokhawala H., Schroth
G., Luo S., Clark DS., Chen F., Zhang T., Mackie RI., Pennacchio LA.,
Tringe SG., Visel A., Woyke T., Wang Z., Rubin EM.} \textbf{2011}.
Metagenomic discovery of biomass-degrading genes and genomes from cow
rumen. \emph{Science (New York, N.Y.)} 331:463--7. DOI:
\href{https://doi.org/10.1126/science.1200387}{10.1126/science.1200387}.

\hypertarget{ref-HuPirs2012}{}
\textbf{Hu X., Yuan J., Shi Y., Lu J., Liu B., Li Z., Chen Y., Mu D.,
Zhang H., Li N., Yue Z., Bai F., Li H., Fan W.} \textbf{2012}. pIRS:
Profile-based Illumina pair-end reads simulator. \emph{Bioinformatics
(Oxford, England)} 28:1533--5. DOI:
\href{https://doi.org/10.1093/bioinformatics/bts187}{10.1093/bioinformatics/bts187}.

\hypertarget{ref-HuangCap31999}{}
\textbf{Huang X., Madan A.} \textbf{1999}. CAP3: A DNA Sequence Assembly
Program. \emph{Genome Research} 9:868--877. DOI:
\href{https://doi.org/10.1101/gr.9.9.868}{10.1101/gr.9.9.868}.

\hypertarget{ref-HuangArt2012}{}
\textbf{Huang W., Li L., Myers JR., Marth GT.} \textbf{2012}. ART: A
next-generation sequencing read simulator. \emph{Bioinformatics (Oxford,
England)} 28:593--4. DOI:
\href{https://doi.org/10.1093/bioinformatics/btr708}{10.1093/bioinformatics/btr708}.

\hypertarget{ref-HugenholtzExploring2002}{}
\textbf{Hugenholtz P.} \textbf{2002}. Exploring prokaryotic diversity in
the genomic era. \emph{Genome biology} 3:REVIEWS0003.

\hypertarget{ref-HugenholtzMicrobiology2008}{}
\textbf{Hugenholtz P., Tyson GW.} \textbf{2008}. Microbiology:
Metagenomics. \emph{Nature} 455:481--483. DOI:
\href{https://doi.org/10.1038/455481a}{10.1038/455481a}.

\hypertarget{ref-HusonPoor2014}{}
\textbf{Huson DH., Xie C.} \textbf{2014}. A poor man's
BLASTX---high-throughput metagenomic protein database search using PAUDA.
\emph{Bioinformatics (Oxford, England)} 30:38--9. DOI:
\href{https://doi.org/10.1093/bioinformatics/btt254}{10.1093/bioinformatics/btt254}.

\hypertarget{ref-HusonIntegrative2011}{}
\textbf{Huson DH., Mitra S., Ruscheweyh H-J., Weber N., Schuster SC.}
\textbf{2011}. Integrative analysis of environmental sequences using
MEGAN4. \emph{Genome research} 21:1552--60. DOI:
\href{https://doi.org/10.1101/gr.120618.111}{10.1101/gr.120618.111}.

\hypertarget{ref-ImelfortGroopm2014}{}
\textbf{Imelfort M., Parks D., Woodcroft BJ., Dennis P., Hugenholtz P.,
Tyson GW.} \textbf{2014}. GroopM: An automated tool for the recovery of
population genomes from related metagenomes. \emph{PeerJ} 2:e603. DOI:
\href{https://doi.org/10.7717/peerj.603}{10.7717/peerj.603}.

\hypertarget{ref-IversonUntangling2012}{}
\textbf{Iverson V., Morris RM., Frazar CD., Berthiaume CT., Morales RL.,
Armbrust EV.} \textbf{2012}. Untangling genomes from metagenomes:
Revealing an uncultured class of marine Euryarchaeota. \emph{Science}
335:587--590. DOI:
\href{https://doi.org/10.1126/science.1212665}{10.1126/science.1212665}.

\hypertarget{ref-KangMetabat2015}{}
\textbf{Kang DD., Froula J., Egan R., Wang Z.} \textbf{2015}. MetaBAT,
an efficient tool for accurately reconstructing single genomes from
complex microbial communities. \emph{PeerJ} 3:e1165. DOI:
\href{https://doi.org/10.7717/peerj.1165}{10.7717/peerj.1165}.

\hypertarget{ref-KarlinCompositional1997}{}
\textbf{Karlin S., Mrazek J., Campbell AM.} \textbf{1997}. Compositional
biases of bacterial genomes and evolutionary implications. \emph{Journal
of bacteriology} 179:3899--3913.

\hypertarget{ref-KimCentrifuge2016}{}
\textbf{Kim D., Song L., Breitwieser FP., Salzberg SL.} \textbf{2016}.
Centrifuge: Rapid and sensitive classification of metagenomic sequences.
\emph{Genome Research} 26:1721--1729. DOI:
\href{https://doi.org/10.1101/gr.210641.116}{10.1101/gr.210641.116}.

\hypertarget{ref-KislyukUnsupervised2009}{}
\textbf{Kislyuk A., Bhatnagar S., Dushoff J., Weitz JS.} \textbf{2009}.
Unsupervised statistical clustering of environmental shotgun sequences.
\emph{BMC bioinformatics} 10:316. DOI:
\href{https://doi.org/10.1186/1471-2105-10-316}{10.1186/1471-2105-10-316}.

\hypertarget{ref-KlumppNext2012}{}
\textbf{Klumpp J., Fouts DE., Sozhamannan S.} \textbf{2012}. Next
generation sequencing technologies and the changing landscape of phage
genomics. \emph{Bacteriophage} 2:190--199. DOI:
\href{https://doi.org/10.4161/bact.22111}{10.4161/bact.22111}.

\hypertarget{ref-KoslickiQuikr2013}{}
\textbf{Koslicki D., Foucart S., Rosen G.} \textbf{2013}. Quikr: A
method for rapid reconstruction of bacterial communities via compressive
sensing. \emph{Bioinformatics (Oxford, England)} 29:2096--102. DOI:
\href{https://doi.org/10.1093/bioinformatics/btt336}{10.1093/bioinformatics/btt336}.

\hypertarget{ref-KuninWrinkles2010}{}
\textbf{Kunin V., Engelbrektson A., Ochman H., Hugenholtz P.}
\textbf{2010}. Wrinkles in the rare biosphere: Pyrosequencing errors can
lead to artificial inflation of diversity estimates. \emph{Environmental
Microbiology} 12:118--123. DOI:
\href{https://doi.org/10.1111/j.1462-2920.2009.02051.x}{10.1111/j.1462-2920.2009.02051.x}.

\hypertarget{ref-LanderGenomic1988}{}
\textbf{Lander ES., Waterman MS.} \textbf{1988}. Genomic mapping by
fingerprinting random clones: A mathematical analysis. \emph{Genomics}
2:231--239. DOI:
\href{https://doi.org/10.1016/0888-7543(88)90007-9}{10.1016/0888-7543(88)90007-9}.

\hypertarget{ref-LangmeadFast2012}{}
\textbf{Langmead B., Salzberg SL.} \textbf{2012}. Fast gapped-read
alignment with Bowtie 2. \emph{Nature Methods} 9:357--359. DOI:
\href{https://doi.org/10.1038/nmeth.1923}{10.1038/nmeth.1923}.

\hypertarget{ref-LaskenRecent2014}{}
\textbf{Lasken RS., McLean JS.} \textbf{2014}. Recent advances in
genomic DNA sequencing of microbial species from single cells.
\emph{Nature Reviews Genetics} 15:577--584. DOI:
\href{https://doi.org/10.1038/nrg3785}{10.1038/nrg3785}.

\hypertarget{ref-LinAccurate2016}{}
\textbf{Lin H-H., Liao Y-C.} \textbf{2016}. Accurate binning of
metagenomic contigs via automated clustering sequences using information
of genomic signatures and marker genes. \emph{Scientific Reports} 6.
DOI: \href{https://doi.org/10.1038/srep24175}{10.1038/srep24175}.

\hypertarget{ref-LindnerMetagenomic2013}{}
\textbf{Lindner MS., Renard BY.} \textbf{2013}. Metagenomic abundance
estimation and diagnostic testing on species level. \emph{Nucleic acids
research} 41:e10. DOI:
\href{https://doi.org/10.1093/nar/gks803}{10.1093/nar/gks803}.

\hypertarget{ref-LuCocacola2016}{}
\textbf{Lu YY., Chen T., Fuhrman JA., Sun F.} \textbf{2016}. COCACOLA:
Binning metagenomic contigs using sequence COmposition, read CoverAge,
CO-alignment, and paired-end read LinkAge. \emph{Bioinformatics}:btw290.

\hypertarget{ref-LuoSoapdenovo22012}{}
\textbf{Luo R., Liu B., Xie Y., Li Z., Huang W., Yuan J., He G., Chen
Y., Pan Q., Liu Y., Tang J., Wu G., Zhang H., Shi Y., Liu Y., Yu C.,
Wang B., Lu Y., Han C., Cheung DW., Yiu S-M., Peng S., Xiaoqian Z., Liu
G., Liao X., Li Y., Yang H., Wang J., Lam T-W., Wang J.} \textbf{2012}.
SOAPdenovo2: An empirically improved memory-efficient short-read de novo
assembler. \emph{GigaScience} 1:18. DOI:
\href{https://doi.org/10.1186/2047-217X-1-18}{10.1186/2047-217X-1-18}.

\hypertarget{ref-LuoLowdensity2016}{}
\textbf{Luo Y., Zeng J., Berger B., Peng J.} \textbf{2016}. Low-density
locality-sensitive hashing boosts metagenomic binning.

\hypertarget{ref-MatsenPplacer2010}{}
\textbf{Matsen FA., Kodner RB., Armbrust EV.} \textbf{2010}. Pplacer:
Linear time maximum-likelihood and Bayesian phylogenetic placement of
sequences onto a fixed reference tree. \emph{BMC bioinformatics} 11:538.
DOI:
\href{https://doi.org/10.1186/1471-2105-11-538}{10.1186/1471-2105-11-538}.

\hypertarget{ref-MavromatisUse2007}{}
\textbf{Mavromatis K., Ivanova N., Barry K., Shapiro H., Goltsman E.,
McHardy AC., Rigoutsos I., Salamov A., Korzeniewski F., Land M., Lapidus
A., Grigoriev I., Richardson P., Hugenholtz P., Kyrpides NC.}
\textbf{2007}. Use of simulated data sets to evaluate the fidelity of
metagenomic processing methods. \emph{Nature methods} 4:495--500. DOI:
\href{https://doi.org/10.1038/nmeth1043}{10.1038/nmeth1043}.

\hypertarget{ref-MchardyAccurate2007}{}
\textbf{McHardy AC., Martín HG., Tsirigos A., Hugenholtz P., Rigoutsos
I.} \textbf{2007}. Accurate phylogenetic classification of
variable-length DNA fragments. \emph{Nature methods} 4:63--72. DOI:
\href{https://doi.org/10.1038/nmeth976}{10.1038/nmeth976}.

\hypertarget{ref-MelstedEfficient2011}{}
\textbf{Melsted P., Pritchard JK.} \textbf{2011}. Efficient counting of
k-mers in DNA sequences using a bloom filter. \emph{BMC Bioinformatics}
12:333. DOI:
\href{https://doi.org/10.1186/1471-2105-12-333}{10.1186/1471-2105-12-333}.

\hypertarget{ref-MendeImproved2016}{}
\textbf{Mende DR., Aylward FO., Eppley JM., Nielsen TN., DeLong EF.}
\textbf{2016}. Improved Environmental Genomes via Integration of
Metagenomic and Single-Cell Assemblies. \emph{Frontiers in Microbiology}
7. DOI:
\href{https://doi.org/10.3389/fmicb.2016.00143}{10.3389/fmicb.2016.00143}.

\hypertarget{ref-MillerAssembly2010}{}
\textbf{Miller JR., Koren S., Sutton G.} \textbf{2010}. Assembly
algorithms for next-generation sequencing data. \emph{Genomics}
95:315--27. DOI:
\href{https://doi.org/10.1016/j.ygeno.2010.03.001}{10.1016/j.ygeno.2010.03.001}.

\hypertarget{ref-MonzoorulhaqueSortitems2009}{}
\textbf{Monzoorul Haque M., Ghosh TS., Komanduri D., Mande SS.}
\textbf{2009}. SOrt-ITEMS: Sequence orthology based approach for
improved taxonomic estimation of metagenomic sequences.
\emph{Bioinformatics (Oxford, England)} 25:1722--30. DOI:
\href{https://doi.org/10.1093/bioinformatics/btp317}{10.1093/bioinformatics/btp317}.

\hypertarget{ref-NielsenIdentification2014}{}
\textbf{Nielsen HB., Almeida M., Juncker AS., Rasmussen S., Li J.,
Sunagawa S., Plichta DR., Gautier L., Pedersen AG., Le Chatelier E.,
Pelletier E., Bonde I., Nielsen T., Manichanh C., Arumugam M., Batto
J-M., Quintanilha dos Santos MB., Blom N., Borruel N., Burgdorf KS.,
Boumezbeur F., Casellas F., Doré J., Dworzynski P., Guarner F., Hansen
T., Hildebrand F., Kaas RS., Kennedy S., Kristiansen K., Kultima JR.,
Léonard P., Levenez F., Lund O., Moumen B., Le Paslier D., Pons N.,
Pedersen O., Prifti E., Qin J., Raes J., Sørensen S., Tap J., Tims S.,
Ussery DW., Yamada T., MetaHIT Consortium., Renault P., Sicheritz-Ponten
T., Bork P., Wang J., Brunak S., Ehrlich SD.} \textbf{2014}.
Identification and assembly of genomes and genetic elements in complex
metagenomic samples without using reference genomes. \emph{Nature
Biotechnology} 32:822--828. DOI:
\href{https://doi.org/10.1038/nbt.2939}{10.1038/nbt.2939}.

\hypertarget{ref-PatilTaxonomic2011}{}
\textbf{Patil KR., Haider P., Pope PB., Turnbaugh PJ., Morrison M.,
Scheffer T., McHardy AC.} \textbf{2011}. Taxonomic metagenome sequence
assignment with structured output models. \emph{Nature Methods}
8:191--192. DOI:
\href{https://doi.org/10.1038/nmeth0311-191}{10.1038/nmeth0311-191}.

\hypertarget{ref-PellScaling2012a}{}
\textbf{Pell J., Hintze A., Canino-Koning R., Howe A., Tiedje JM., Brown
CT.} \textbf{2012}. Scaling metagenome sequence assembly with
probabilistic de Bruijn graphs. \emph{Proceedings of the National
Academy of Sciences} 109:13272--13277.

\hypertarget{ref-PonomarovaMetabolic2015}{}
\textbf{Ponomarova O., Patil KR.} \textbf{2015}. Metabolic interactions
in microbial communities: Untangling the Gordian knot. \emph{Current
Opinion in Microbiology} 27:37--44. DOI:
\href{https://doi.org/10.1016/j.mib.2015.06.014}{10.1016/j.mib.2015.06.014}.

\hypertarget{ref-PopeIsolation2011}{}
\textbf{Pope PB., Smith W., Denman SE., Tringe SG., Barry K., Hugenholtz
P., McSweeney CS., McHardy AC., Morrison M.} \textbf{2011}. Isolation
of Succinivibrionaceae implicated in low methane emissions from Tammar
wallabies. \emph{Science (New York, N.Y.)} 333:646--8. DOI:
\href{https://doi.org/10.1126/science.1205760}{10.1126/science.1205760}.

\hypertarget{ref-PriceGenomescale2004}{}
\textbf{Price ND., Reed JL., Palsson BØ.} \textbf{2004}. Genome-scale
models of microbial cells: Evaluating the consequences of constraints.
\emph{Nature Reviews Microbiology} 2:886--897. DOI:
\href{https://doi.org/10.1038/nrmicro1023}{10.1038/nrmicro1023}.

\hypertarget{ref-PrzyborowskiHomogeneity1940}{}
\textbf{Przyborowski J., Wilenski H.} \textbf{1940}. Homogeneity of
Results in Testing Samples from Poisson Series: With an Application to
Testing Clover Seed for Dodder. \emph{Biometrika} 31:313. DOI:
\href{https://doi.org/10.2307/2332612}{10.2307/2332612}.

\hypertarget{ref-QinHuman2010}{}
\textbf{Qin J., Li R., Raes J., Arumugam M., Burgdorf KS., Manichanh C.,
Nielsen T., Pons N., Levenez F., Yamada T., Mende DR., Li J., Xu J., Li
S., Li D., Cao J., Wang B., Liang H., Zheng H., Xie Y., Tap J., Lepage
P., Bertalan M., Batto J-M., Hansen T., Le Paslier D., Linneberg A.,
Nielsen HB., Pelletier E., Renault P., Sicheritz-Ponten T., Turner K.,
Zhu H., Yu C., Li S., Jian M., Zhou Y., Li Y., Zhang X., Li S., Qin N.,
Yang H., Wang J., Brunak S., Doré J., Guarner F., Kristiansen K.,
Pedersen O., Parkhill J., Weissenbach J., Bork P., Ehrlich SD., Wang J.}
\textbf{2010}. A human gut microbial gene catalogue established by
metagenomic sequencing. \emph{Nature} 464:59--65. DOI:
\href{https://doi.org/10.1038/nature08821}{10.1038/nature08821}.

\hypertarget{ref-QuinceRational2008}{}
\textbf{Quince C., Curtis TP., Sloan WT.} \textbf{2008}. The rational
exploration of microbial diversity. \emph{The ISME journal} 2:997--1006.
DOI:
\href{https://doi.org/10.1038/ismej.2008.69}{10.1038/ismej.2008.69}.

\hypertarget{ref-QuinceAccurate2009}{}
\textbf{Quince C., Lanzén A., Curtis TP., Davenport RJ., Hall N., Head
IM., Read LF., Sloan WT.} \textbf{2009}. Accurate determination of
microbial diversity from 454 pyrosequencing data. \emph{Nature methods}
6:639--41. DOI:
\href{https://doi.org/10.1038/nmeth.1361}{10.1038/nmeth.1361}.

\hypertarget{ref-RiesenfeldUncultured2004}{}
\textbf{Riesenfeld CS., Goodman RM., Handelsman J.} \textbf{2004}.
Uncultured soil bacteria are a reservoir of new antibiotic resistance
genes. \emph{Environmental Microbiology} 6:981--989. DOI:
\href{https://doi.org/10.1111/j.1462-2920.2004.00664.x}{10.1111/j.1462-2920.2004.00664.x}.

\hypertarget{ref-RiesenfeldMetagenomics2004}{}
\textbf{Riesenfeld CS., Schloss PD., Handelsman J.} \textbf{2004}.
Metagenomics: Genomic analysis of microbial communities. \emph{Annual
review of genetics} 38:525--52. DOI:
\href{https://doi.org/10.1146/annurev.genet.38.072902.091216}{10.1146/annurev.genet.38.072902.091216}.

\hypertarget{ref-RondonCloning2000}{}
\textbf{Rondon MR., August PR., Bettermann AD., Brady SF., Grossman TH.,
Liles MR., Loiacono KA., Lynch BA., MacNeil IA., Minor C., others.}
\textbf{2000}. Cloning the soil metagenome: A strategy for accessing the
genetic and functional diversity of uncultured microorganisms.
\emph{Applied and environmental microbiology} 66:2541--2547.

\hypertarget{ref-RosenNbc2011}{}
\textbf{Rosen GL., Reichenberger ER., Rosenfeld AM.} \textbf{2011}. NBC:
The Naive Bayes Classification tool webserver for taxonomic
classification of metagenomic reads. \emph{Bioinformatics (Oxford,
England)} 27:127--9. DOI:
\href{https://doi.org/10.1093/bioinformatics/btq619}{10.1093/bioinformatics/btq619}.

\hypertarget{ref-SayersDatabase2009}{}
\textbf{Sayers EW., Barrett T., Benson D., Bryant SH., Canese K.,
Chetvernin V., Church DM., DiCuccio M., Edgar R., Federhen S., Feolo M.,
Geer LY., Helmberg W., Kapustin Y., Landsman D., Lipman DJ., Madden TL.,
Maglott DR., Miller V., Mizrachi I., Ostell J., Pruitt KD., Schuler GD.,
Sequeira E., Sherry ST., Shumway M., Sirotkin K., Souvorov A.,
Starchenko G., Tatusova TA., Wagner L., Yaschenko E., Ye J.}
\textbf{2009}. Database resources of the National Center for
Biotechnology Information. \emph{Nucleic acids research} 37:D5--15. DOI:
\href{https://doi.org/10.1093/nar/gkn741}{10.1093/nar/gkn741}.

\hypertarget{ref-SchloissnigGenomic2013}{}
\textbf{Schloissnig S., Arumugam M., Sunagawa S., Mitreva M., Tap J.,
Zhu A., Waller A., Mende DR., Kultima JR., Martin J., Kota K., Sunyaev
SR., Weinstock GM., Bork P.} \textbf{2013}. Genomic variation landscape
of the human gut microbiome. \emph{Nature} 493:45--50. DOI:
\href{https://doi.org/10.1038/nature11711}{10.1038/nature11711}.

\hypertarget{ref-SchlossCensus2006}{}
\textbf{Schloss PD., Handelsman J.} \textbf{2006}. Toward a census of
bacteria in soil. \emph{PLoS computational biology} 2:e92. DOI:
\href{https://doi.org/10.1371/journal.pcbi.0020092}{10.1371/journal.pcbi.0020092}.

\hypertarget{ref-SczyrbaCritical2017}{}
\textbf{Sczyrba A., Hofmann P., Belmann P., Koslicki D., Janssen S.,
Droege J., Gregor I., Majda S., Fiedler J., Dahms E., Bremges A., Fritz
A., Garrido-Oter R., Jorgensen TS., Shapiro N., Blood PD., Gurevich A.,
Bai Y., Turaev D., DeMaere MZ., Chikhi R., Nagarajan N., Quince C.,
Hansen LH., Sorensen SJ., Chia BKH., Denis B., Froula JL., Wang Z., Egan
R., Kang DD., Cook JJ., Deltel C., Beckstette M., Lemaitre C.,
Peterlongo P., Rizk G., Lavenier D., Wu Y-W., Singer SW., Jain C.,
Strous M., Klingenberg H., Meinicke P., Barton M., Lingner T., Lin H-H.,
Liao Y-C., Silva GGZ., Cuevas DA., Edwards RA., Saha S., Piro VC.,
Renard BY., Pop M., Klenk H-P., Goeker M., Kyrpides N., Woyke T.,
Vorholt JA., Schulze-Lefert P., Rubin EM., Darling AE., Rattei T.,
McHardy AC.} \textbf{2017}. Critical Assessment of Metagenome
Interpretation -- a benchmark of computational metagenomics software.
\emph{bioRxiv}:099127. DOI:
\href{https://doi.org/10.1101/099127}{10.1101/099127}.

\hypertarget{ref-SedlarBioinformatics2017}{}
\textbf{Sedlar K., Kupkova K., Provaznik I.} \textbf{2017}.
Bioinformatics strategies for taxonomy independent binning and
visualization of sequences in shotgun metagenomics. \emph{Computational
and Structural Biotechnology Journal} 15:48--55. DOI:
\href{https://doi.org/10.1016/j.csbj.2016.11.005}{10.1016/j.csbj.2016.11.005}.

\hypertarget{ref-SegataMetagenomic2012}{}
\textbf{Segata N., Waldron L., Ballarini A., Narasimhan V., Jousson O.,
Huttenhower C.} \textbf{2012}. Metagenomic microbial community profiling
using unique clade-specific marker genes. \emph{Nature methods}:1--7.
DOI: \href{https://doi.org/10.1038/nmeth.2066}{10.1038/nmeth.2066}.

\hypertarget{ref-SilvaFocus2014}{}
\textbf{Silva GGZ., Cuevas DA., Dutilh BE., Edwards RA.}
\textbf{2014}. FOCUS: An alignment-free model to identify organisms in
metagenomes using non-negative least squares. \emph{PeerJ} 2:e425. DOI:
\href{https://doi.org/10.7717/peerj.425}{10.7717/peerj.425}.

\hypertarget{ref-StarkMltreemap2010}{}
\textbf{Stark M., Berger S., Stamatakis A., von Mering C.}
\textbf{2010}. MLTreeMap -- accurate maximum likelihood placement of
environmental DNA sequences into taxonomic and functional reference
phylogenies. \emph{BMC genomics} 11:461. DOI:
\href{https://doi.org/10.1186/1471-2164-11-461}{10.1186/1471-2164-11-461}.

\hypertarget{ref-StewartGrowing2012}{}
\textbf{Stewart EJ.} \textbf{2012}. Growing Unculturable Bacteria.
\emph{Journal of Bacteriology} 194:4151--4160. DOI:
\href{https://doi.org/10.1128/JB.00345-12}{10.1128/JB.00345-12}.

\hypertarget{ref-StrousBinning2012}{}
\textbf{Strous M., Kraft B., Bisdorf R., Tegetmeyer H.} \textbf{2012}.
The Binning of Metagenomic Contigs for Microbial Physiology of Mixed
Cultures. \emph{Frontiers in Microbiology} 3. DOI:
\href{https://doi.org/10.3389/fmicb.2012.00410}{10.3389/fmicb.2012.00410}.

\hypertarget{ref-SunagawaMetagenomic2013}{}
\textbf{Sunagawa S., Mende DR., Zeller G., Izquierdo-Carrasco F.,
Berger SA., Kultima JR., Coelho LP., Arumugam M., Tap J., Nielsen HB.,
Rasmussen S., Brunak S., Pedersen O., Guarner F., de Vos WM., Wang J.,
Li J., Doré J., Ehrlich SD., Stamatakis A., Bork P.} \textbf{2013}.
Metagenomic species profiling using universal phylogenetic marker genes.
\emph{Nature methods} 10:1196--9. DOI:
\href{https://doi.org/10.1038/nmeth.2693}{10.1038/nmeth.2693}.

\hypertarget{ref-SuttonTigr1995}{}
\textbf{Sutton GG., White O., Adams MD., Kerlavage AR.} \textbf{1995}.
TIGR Assembler: A New Tool for Assembling Large Shotgun Sequencing
Projects. \emph{Genome Science and Technology} 1:9--19. DOI:
\href{https://doi.org/10.1089/gst.1995.1.9}{10.1089/gst.1995.1.9}.

\hypertarget{ref-TeelingTetra2004}{}
\textbf{Teeling H., Waldmann J., Lombardot T., Bauer M., Glöckner FO.}
\textbf{2004}. TETRA: A web-service and a stand-alone program for the
analysis and comparison of tetranucleotide usage patterns in DNA
sequences. \emph{BMC bioinformatics} 5:163. DOI:
\href{https://doi.org/10.1186/1471-2105-5-163}{10.1186/1471-2105-5-163}.

\hypertarget{ref-TurnbaughInvitation2008}{}
\textbf{Turnbaugh PJ., Gordon JI.} \textbf{2008}. An Invitation to the
Marriage of Metagenomics and Metabolomics. \emph{Cell} 134:708--713.
DOI:
\href{https://doi.org/10.1016/j.cell.2008.08.025}{10.1016/j.cell.2008.08.025}.

\hypertarget{ref-TysonCommunity2004}{}
\textbf{Tyson GW., Chapman J., Hugenholtz P., Allen EE., Ram RJ.,
Richardson PM., Solovyev VV., Rubin EM., Rokhsar DS., Banfield JF.}
\textbf{2004}. Community structure and metabolism through reconstruction
of microbial genomes from the environment. \emph{Nature} 428:37--43.
DOI: \href{https://doi.org/10.1038/nature02340}{10.1038/nature02340}.

\hypertarget{ref-UfarteDiscovery2015}{}
\textbf{Ufarté L., Potocki-Veronese G., Laville É.} \textbf{2015}.
Discovery of new protein families and functions: New challenges in
functional metagenomics for biotechnologies and microbial ecology.
\emph{Frontiers in Microbiology} 6. DOI:
\href{https://doi.org/10.3389/fmicb.2015.00563}{10.3389/fmicb.2015.00563}.

\hypertarget{ref-UlyantsevMetafast2016}{}
\textbf{Ulyantsev VI., Kazakov SV., Dubinkina VB., Tyakht AV., Alexeev
DG.} \textbf{2016}. MetaFast: Fast reference-free graph-based comparison
of shotgun metagenomic data. \emph{Bioinformatics} 32:2760--2767. DOI:
\href{https://doi.org/10.1093/bioinformatics/btw312}{10.1093/bioinformatics/btw312}.

\hypertarget{ref-VenterEnvironmental2004}{}
\textbf{Venter JC., Remington K., Heidelberg JF., Halpern AL., Rusch D.,
Eisen JA., Wu D., Paulsen I., Nelson KE., Nelson W., Fouts DE., Levy
S., Knap AH., Lomas MW., Nealson K., White O., Peterson J., Hoffman J.,
Parsons R., Baden-Tillson H., Pfannkoch C., Rogers Y-H., Smith HO.}
\textbf{2004}. Environmental genome shotgun sequencing of the Sargasso
Sea. \emph{Science (New York, N.Y.)} 304:66--74. DOI:
\href{https://doi.org/10.1126/science.1093857}{10.1126/science.1093857}.

\hypertarget{ref-VinhTwophase2015}{}
\textbf{Vinh LV., Lang TV., Binh LT., Hoai TV.} \textbf{2015}. A
two-phase binning algorithm using l-mer frequency on groups of
non-overlapping reads. \emph{Algorithms for Molecular Biology} 10:2.
DOI:
\href{https://doi.org/10.1186/s13015-014-0030-4}{10.1186/s13015-014-0030-4}.

\hypertarget{ref-WaltNumpy2011}{}
\textbf{Walt S van der., Colbert SC., Varoquaux G.} \textbf{2011}. The
NumPy Array: A Structure for Efficient Numerical Computation.
\emph{Computing in Science \& Engineering} 13:22--30. DOI:
\href{https://doi.org/10.1109/MCSE.2011.37}{10.1109/MCSE.2011.37}.

\hypertarget{ref-WangNaive2007}{}
\textbf{Wang Q., Garrity GM., Tiedje JM., Cole JR.} \textbf{2007}. Naive
Bayesian classifier for rapid assignment of rRNA sequences into the new
bacterial taxonomy. \emph{Applied and environmental microbiology}
73:5261--7. DOI:
\href{https://doi.org/10.1128/AEM.00062-07}{10.1128/AEM.00062-07}.

\hypertarget{ref-WommackMetagenomics2008}{}
\textbf{Wommack KE., Bhavsar J., Ravel J.} \textbf{2008}. Metagenomics:
Read Length Matters. \emph{Applied and Environmental Microbiology}
74:1453--1463. DOI:
\href{https://doi.org/10.1128/AEM.02181-07}{10.1128/AEM.02181-07}.

\hypertarget{ref-WoodKraken2014}{}
\textbf{Wood DE., Salzberg SL.} \textbf{2014}. Kraken: Ultrafast
metagenomic sequence classification using exact alignments. \emph{Genome
biology} 15:R46. DOI:
\href{https://doi.org/10.1186/gb-2014-15-3-r46}{10.1186/gb-2014-15-3-r46}.

\hypertarget{ref-WoykeOne2010}{}
\textbf{Woyke T., Tighe D., Mavromatis K., Clum A., Copeland A.,
Schackwitz W., Lapidus A., Wu D., McCutcheon JP., McDonald BR., Moran
NA., Bristow J., Cheng J-F.} \textbf{2010}. One bacterial cell, one
complete genome. \emph{PloS one} 5:e10314. DOI:
\href{https://doi.org/10.1371/journal.pone.0010314}{10.1371/journal.pone.0010314}.

\hypertarget{ref-WoykeAssembling2009}{}
\textbf{Woyke T., Xie G., Copeland A., González JM., Han C., Kiss H.,
Saw JH., Senin P., Yang C., Chatterji S., Cheng J-F., Eisen JA.,
Sieracki ME., Stepanauskas R.} \textbf{2009}. Assembling the marine
metagenome, one cell at a time. \emph{PloS one} 4:e5299. DOI:
\href{https://doi.org/10.1371/journal.pone.0005299}{10.1371/journal.pone.0005299}.

\hypertarget{ref-WuPhylogenydriven2009}{}
\textbf{Wu D., Hugenholtz P., Mavromatis K., Pukall R., Dalin E.,
Ivanova NN., Kunin V., Goodwin L., Wu M., Tindall BJ., Hooper SD., Pati
A., Lykidis A., Spring S., Anderson IJ., D'haeseleer P., Zemla A.,
Singer M., Lapidus A., Nolan M., Copeland A., Han C., Chen F., Cheng
J-F., Lucas S., Kerfeld C., Lang E., Gronow S., Chain P., Bruce D.,
Rubin EM., Kyrpides NC., Klenk H-P., Eisen JA.} \textbf{2009}. A
phylogeny-driven genomic encyclopaedia of Bacteria and Archaea.
\emph{Nature} 462:1056--60. DOI:
\href{https://doi.org/10.1038/nature08656}{10.1038/nature08656}.

\hypertarget{ref-WuMaxbin2014}{}
\textbf{Wu Y-W., Tang Y-H., Tringe SG., Simmons BA., Singer SW.}
\textbf{2014}. MaxBin: An automated binning method to recover individual
genomes from metagenomes using an expectation-maximization algorithm.
\emph{Microbiome} 2:26. DOI:
\href{https://doi.org/10.1186/2049-2618-2-26}{10.1186/2049-2618-2-26}.

\hypertarget{ref-YuMicrofluidicbased2017}{}
\textbf{Yu F., Blainey PC., Schulz F., Woyke T., Horowitz MA., Quake
SR.} \textbf{2017}. Microfluidic-based mini-metagenomics enables
discovery of novel microbial lineages from complex environmental
samples. \emph{bioRxiv}:114496. DOI:
\href{https://doi.org/10.1101/114496}{10.1101/114496}.

\hypertarget{ref-ZhaoRapsearch22011}{}
\textbf{Zhao Y., Tang H., Ye Y.} \textbf{2011}. RAPSearch2: A fast and
memory-efficient protein similarity search tool for next generation
sequencing data. \emph{Bioinformatics} 28:125--126. DOI:
\href{https://doi.org/10.1093/bioinformatics/btr595}{10.1093/bioinformatics/btr595}.

\sloppy
\printglossary[nonumberlist]

%-------------------------------------
%	Original PDFs (supplements)
%-------------------------------------
\begin{appendices}
\addtocontents{toc}{\protect\setcounter{tocdepth}{0}}
\chapter{\texorpdfstring{Supplementary Material ``\emph{Taxator-tk}:
Precise Taxonomic Assignment of Metagenomes by Fast Approximation of
Evolutionary
Neighborhoods''}{Supplementary Material Taxator-tk: Precise Taxonomic Assignment of Metagenomes by Fast Approximation of Evolutionary Neighborhoods}}\label{supplementary-material-taxator-tk-precise-taxonomic-assignment-of-metagenomes-by-fast-approximation-of-evolutionary-neighborhoods}

\includepdf[pages={-},angle=0,offset=20 0,scale=0.9,fitpaper,rotateoversize]{supplement/supplement_01_taxatortk.pdf}

\chapter{\texorpdfstring{Supplementary Material for ``A Probabilistic
Model to Recover Genomes in Shotgun
Metagenomics''}{Supplementary Material for A Probabilistic Model to Recover Genomes in Shotgun Metagenomics}}\label{supplementary-material-for-a-probabilistic-model-to-recover-genomes-in-shotgun-metagenomics}

\includepdf[pages={-},angle=0,offset=20 0,scale=0.9,fitpaper,rotateoversize]{supplement/supplement_02_mglex.pdf}

\chapter{Taxonomic Binning of Metagenome Samples Generated by
Next-generation Sequencing Technologies}\label{sec:full_bib}

\textbf{J. Dröge}\textsuperscript{1,2} and \textbf{A. C.
McHardy}\textsuperscript{1,2*}

\textsuperscript{1}Max Planck Research Group for Computational Genomics
\& Epidemiology, Max Planck Institute for Informatics, Campus E1 4,
66123 Saarbrücken, Germany

\textsuperscript{2}Department of Algorithmic Bioinformatics, Heinrich
Heine University Düsseldorf, Institute for Computer Science,
Universitätsstraße 1, 40225 Düsseldorf, Germany

\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}

\textbf{This is a pre-copyedited, author-produced version of an article
accepted for publication in
\emph{\href{https://bib.oxfordjournals.org}{Briefings in
Bioinformatics}} following peer review. This article version has been
adapted to the thesis layout. The original article is available online
by DOI \href{https://doi.org/10.1093/bib/bbs031}{10.1093/bib/bbs031}.}

\section{Abstract}\label{abstract}

Metagenome research uses random shotgun sequencing of microbial
community DNA to study the genetic sequences of its members without
cultivation. This development has been strongly supported by
improvements in sequencing technologies, which have rendered sequencing
cheaper than before. As a consequence, downstream computational analysis
of metagenome sequence samples is now faced with large amounts of
complex data. One of the essential steps in metagenome analysis is
reconstruction of draft genomes for populations of a community or of
draft `pan-genomes' for higher-level clades. `Taxonomic binning'
corresponds to the process of assigning a taxonomic identifier to
sequence fragments, based on information such as sequence similarity,
sequence composition or read coverage. This is used for draft genome
reconstruction, if sequencing coverage is insufficient for
reconstruction based on assembly information alone. Subsequent
functional and metabolic annotation of draft genomes allows a
genome-level analysis of novel uncultured microbial species and even
inference of their cultivation requirements.

\section{Introduction}\label{introduction}

The application of genome sequencing technologies to the study of an
entire community of microbial organisms, as opposed to a clonal culture
of an individual isolate strain, is known as metagenomics (Kunin et al.,
2008; Simon \& Daniel, 2011). Such analysis allows one to determine
genome sequence information for a vast portion of the microbial world
for which cultivation conditions are unknown or difficult to reproduce
under laboratory conditions (Amann, Ludwig \& Schleifer, 1995;
Hugenholtz, 2002). Even the first metagenome studies, investigating the
Sargasso Sea (Venter et al., 2004) and Minnesota farm soil (Tringe et
al., 2005), were able to demonstrate the enormous potential of the
microbial world to serve as a treasure trove of genes with novel
functionalities, as these studies resulted in the discovery of many
thousands of new gene sequences that were only remotely similar to genes
of known function. They also revealed the unexpected complexity of
microbial communities in terms of the number of taxa contained therein.
Since then, much research has explored microbial ecosystems, soil,
aquatic and host-associated, in more detail (Woyke et al., 2006;
Warnecke et al., 2007; Turnbaugh et al., 2010; Suen et al., 2010;
Mackelprang et al., 2011), and has revealed a great wealth of novel
genetic information from microbial species that are only distantly
related to well studied model organisms.

Both amplicon sequencing and random shotgun sequencing of microbial
communities are sometimes referred to as metagenomics. Amplicon
sequencing, or environmental tag sequencing, is used to determine the
taxonomic composition and phylogenetic structure of a microbial
community. In amplicon sequencing, informative marker regions of the
genomes from a microbial community are amplified by polymerase chain
reaction, and used as a proxy to determine which phylotypes or
operational taxonomic units (OTUs) are present in a microbial community,
and their relative abundance. Commonly used marker regions are the
ribosomal genes (Huse et al., 2008) and the ITS (internal transcribed
spacer) region (Jeewon \& Hyde, 2007), which is positioned between
ribosomal genes. In terms of numbers and the evolutionary closeness of
the distinct species present, microbial community profiles can be
correlated across environments and communities, linked to environmental
parameters. They can be indicative of the presence of genes that are
relevant for particular metabolic functionalities (Fuhrman, 2009), given
that the respective genes are already known. However, the gene inventory
and the encoded functionality of most microbial species are largely
unknown and may also vary considerably between strains.

Shotgun sequencing can be used to study the genetic information of
microbial communities by sequencing DNA that has been extracted and
randomly sheared into smaller fragments. Even though subject to
different technology-dependent biases, this procedure allows functional
and process-level characterization of microbial communities as a whole
and the reconstruction of draft genome sequences for individual
community members.

\section{Next-generation sequencing
technologies}\label{next-generation-sequencing-technologies}

DNA sequencing technologies have rapidly advanced over the last five
years and these developments have substantially shaped the way
metagenome research is performed. Post-Sanger sequencing technologies
are commonly referred to as next-generation sequencing (NGS) (Mardis,
2008; Metzker, 2009). In comparison to Sanger sequencing, NGS methods
can sequence DNA more quickly and at lower cost through massive
parallelization. This is generally achieved by amplification and
fixation of millions of individual template molecules or their enzyme
counterparts on a solid phase prior to sequencing. While Sanger
sequencing results in read lengths of around 800 bp, the commercially
available NGS technologies (tbl.~\ref{tbl:bib_sequencing_technologies})
currently generate reads of approximately 50--75 bp (Applied
Biosciences/Life Technologies -- SOLiD), 75--150 bp (Solexa/Illumina --
Sequencing by Synthesis), 100--200 bp (IonTorrent/Life Technologies --
Semiconductor Chip Sequencing) and 550--1000 bp (454/Roche --
Pyrosequencing). The upcoming generation (Schadt, Turner \& Kasarskis,
2010; Thompson \& Milos, 2011) of sequencers using single molecule
sequencing produces read lengths of over 1 kb (PacBio, SMRT, 15--20\%
assumed error rate (Schadt, Turner \& Kasarskis, 2010)) and of 5--10 kb
(Oxford Nanopore technology, 5\% assumed error rate). Besides different
read lengths and amounts of sequence data produced, each technology has
a characteristic profile of sequencing errors, resulting from the
technology-specific preparation and detection procedures. The choice of
an appropriate sequencing technology depends on the scientific questions
asked. For instance, while an 80 bp read is sufficient to cover a
hyper-variable region in the 16S gene (Huse et al., 2008) for analysis
of microbial community composition, de novo recovery of draft microbial
genome sequences by taxonomic binning from a complex organismal mixture
requires substantially longer reads or higher sequencing depth and
sequencing of short paired reads (Turnbaugh et al., 2010; Hess et al.,
2011; Mackelprang et al., 2011; Iverson et al., 2012).

\hypertarget{tbl:bib_sequencing_technologies}{}
\begin{longtable}[]{@{}llllll@{}}
\caption[Throughput and read lengths of sequencing technologies]{\label{tbl:bib_sequencing_technologies}Throughput and read
lengths of different sequencing technologies. *: Normalized throughput
is scaled to a one-hour period and rounded. **: The throughput scale is
compared to Life Technologies 3730 Sanger chemistry based sequencer and
shows the ratio of throughput values in terms of order of magnitude.
***: Numbers are based on vendor information: Illumina Inc.
(www.illumina.com), Life Technologies (www.lifetechnologies.com),
Roche/454 (www.454.com). Due to lack of information on sequencing
statistics or commercial availability, Pacific Biosciences
(www.pacificbiosciences.com), Oxford Nanopore Technologies
(www.nanoporetech.com) and Helicos Biosciences (www.helicosbio.com) are
excluded. }\tabularnewline
\toprule
\begin{minipage}[b]{0.30\columnwidth}\raggedright\strut
Manufacturer \& technology\strut
\end{minipage} & \begin{minipage}[b]{0.08\columnwidth}\raggedright\strut
Length (bp)\strut
\end{minipage} & \begin{minipage}[b]{0.15\columnwidth}\raggedright\strut
Through- put***\strut
\end{minipage} & \begin{minipage}[b]{0.13\columnwidth}\raggedright\strut
Normalized throughput* (Mb/h)\strut
\end{minipage} & \begin{minipage}[b]{0.10\columnwidth}\raggedright\strut
Through- put scale**\strut
\end{minipage} & \begin{minipage}[b]{0.08\columnwidth}\raggedright\strut
Time per run\strut
\end{minipage}\tabularnewline
\midrule
\endfirsthead
\toprule
\begin{minipage}[b]{0.30\columnwidth}\raggedright\strut
Manufacturer \& technology\strut
\end{minipage} & \begin{minipage}[b]{0.08\columnwidth}\raggedright\strut
Length (bp)\strut
\end{minipage} & \begin{minipage}[b]{0.15\columnwidth}\raggedright\strut
Through- put***\strut
\end{minipage} & \begin{minipage}[b]{0.13\columnwidth}\raggedright\strut
Normalized throughput* (Mb/h)\strut
\end{minipage} & \begin{minipage}[b]{0.10\columnwidth}\raggedright\strut
Through- put scale**\strut
\end{minipage} & \begin{minipage}[b]{0.08\columnwidth}\raggedright\strut
Time per run\strut
\end{minipage}\tabularnewline
\midrule
\endhead
\begin{minipage}[t]{0.30\columnwidth}\raggedright\strut
Solexa/ Illumina Sequencing by Synthesis\strut
\end{minipage} & \begin{minipage}[t]{0.08\columnwidth}\raggedright\strut
100 -- 150\strut
\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\raggedright\strut
300 Gb/8.5 d -- 600 Gb/11 d\strut
\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright\strut
1,500 -- 2,300\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
10\textsuperscript{4}\strut
\end{minipage} & \begin{minipage}[t]{0.08\columnwidth}\raggedright\strut
8.5 d -- 11 d\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.30\columnwidth}\raggedright\strut
Life Technologies/ Applied Biosystems SOLiD\strut
\end{minipage} & \begin{minipage}[t]{0.08\columnwidth}\raggedright\strut
50 -- 75\strut
\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\raggedright\strut
7 Gb/d -- 20 Gb/d\strut
\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright\strut
300 -- 800\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
10\textsuperscript{3} -- 10\textsuperscript{4}\strut
\end{minipage} & \begin{minipage}[t]{0.08\columnwidth}\raggedright\strut
2 d -- 7 d\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.30\columnwidth}\raggedright\strut
Life Technologies/ Ion Torrent\strut
\end{minipage} & \begin{minipage}[t]{0.08\columnwidth}\raggedright\strut
100 -- 200\strut
\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\raggedright\strut
10 Mb/2 h -- 1 Gb/2 h\strut
\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright\strut
5 -- 500\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
10\textsuperscript{1} -- 10\textsuperscript{3}\strut
\end{minipage} & \begin{minipage}[t]{0.08\columnwidth}\raggedright\strut
2 h\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.30\columnwidth}\raggedright\strut
Roche/ 454 Pyrosequencing\strut
\end{minipage} & \begin{minipage}[t]{0.08\columnwidth}\raggedright\strut
550 -- 1000\strut
\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\raggedright\strut
450 Mb/10 h -- 700 Mb/23 h\strut
\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright\strut
30 -- 45\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
10\textsuperscript{2}\strut
\end{minipage} & \begin{minipage}[t]{0.08\columnwidth}\raggedright\strut
10 h -- 23 h\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.30\columnwidth}\raggedright\strut
Life Technologies Capillary Sanger sequencing\strut
\end{minipage} & \begin{minipage}[t]{0.08\columnwidth}\raggedright\strut
600 -- 900\strut
\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\raggedright\strut
690 Kb/d -- 2,100 Kb/d\strut
\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright\strut
0.029 -- 0.088\strut
\end{minipage} & \begin{minipage}[t]{0.10\columnwidth}\raggedright\strut
10\textsuperscript{0}\strut
\end{minipage} & \begin{minipage}[t]{0.08\columnwidth}\raggedright\strut
\textasciitilde{} 7 h {[}15{]}\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\section{Bioinformatic analysis of metagenome
samples}\label{bioinformatic-analysis-of-metagenome-samples}

NGS produces large volumes of sequence data
(tbl.~\ref{tbl:bib_sequencing_technologies}). Currently, a single run of
an Illumina HiSeq machine generates up to 600 Gb
(www.illumina.com), which is of the order of 10\textsuperscript{4} times the amount of
data produced in a similar timeframe by a Sanger sequencing chemistry
based sequencer (tbl.~\ref{tbl:bib_sequencing_technologies}). This, in
turn, results in drastically increased runtimes for all the
bioinformatics procedures applied in metagenomics (Wilkening et al.,
2009), such as assembly of sequence fragments, taxonomic binning,
prediction of protein encoding genes, as well as functional and
process-level gene annotation. Together, taxonomic binning and assembly
allow draft genome reconstructions for community members for which
sequencing has recovered substantial amounts of sequence. Assembly
corresponds to the computational process of placing individual reads
into longer pieces of contiguous sequences, known as contigs, based on
sequence overlaps and paired read information. Taxonomic binning sorts
the contigs of a metagenome sample into `bins' that represent the
populations or higher-level clades of community members. Though both
tasks are performed independently and evaluate different types of
information, the problem of metagenome sequence assembly is closely
related to taxonomic binning, as both allow the reconstruction of draft
genome sequences. The terms ``taxonomic'' and ``phylogenetic'' binning
are both used in the literature, as modern taxonomies such as the NCBI
taxonomy (Sayers et al., 2009) or the ribosomal gene based RDP-II (Cole
et al., 2009), GreenGenes (DeSantis et al., 2006) and ARB-SILVA (Pruesse
et al., 2007) taxonomies are built upon phylogenetic principles. Even
though the NCBI taxonomy is less consistent, taxonomic binning software
for shotgun metagenomics most frequently relies on it, probably due
to its widespread use in annotation of public sequence data.

Similar to the assembly of individual isolated genomes (Miller, Koren \&
Sutton, 2010), assembly in metagenomics aims to recover long contiguous
pieces of sequence from the sequence collection of reads that represent
parts of the genomes of individual community members. Massively
increased amounts of data, varying organism abundances within a sampled
community, differing complexities in terms of the overall number of
organisms contained and the presence of multiple closely related
organisms all challenge the sequence assemblers that were originally
designed for isolated genomes. To address these challenges, methods
designed for assembly of microbial community NGS data (Laserson, Jojic
\& Koller, 2011; Peng et al., 2011; Koren, Treangen \& Pop, 2011; Pell
et al., 2012) are being developed. Paired-end or mate-pair protocols,
which add distance information between two individual reads, can greatly
aid in the assembly process. Assembly information such as the ordering
of contigs within a scaffold can also be used to check binning quality,
and binning has been used to refine assembly in a feedback process. In
recent studies, the joint analysis of assembly information and sequence
composition allowed the reconstruction of several partial genomes by
taxonomic binning (Hess et al., 2011; Iverson et al., 2012). Thus, a
closer integration of the two approaches appears promising for draft
genome reconstruction from NGS metagenome data.

Following assembly and binning, further bioinformatic analyses include
the prediction of genes, as well as functional annotation and
reconstruction of potential pathways. For these steps, dedicated web
servers exist, such as MG-RAST (Meyer et al., 2008), IMG/M (Markowitz et
al., 2012) and CAMERA (Sun et al., 2011). Analysis of the gene content
of individual bins allows inference of the functional and metabolic
capabilities of individual community members, and allows a metagenome
sample to be studied in its entirety. If read lengths or sequencing
depth are insufficient for assembly, the functional analysis of a
metagenome sample is restricted to what can be inferred without partial
genome reconstructions for individual community members.

\section{Binning strategies}\label{binning-strategies}

The term binning was originally coined for the problem of separating the
sequence fragments of a metagenome according to the microbial
populations they originate from (Tyson et al., 2004; Woyke et al.,
2006). The definition has been extended to include bins that represent
all fragments that originate from a common higher-level clade, in cases
where resolution down to individual populations is not possible. For
placement of sequence fragments into taxonomic bins, attributes which
are indicative of the taxonomic origin of a fragment are evaluated.
Different types of information can be used for this purpose: (a) local
sequence similarity to sequences of known taxa (used in similarity-based
taxonomic assignment), (b) similarity in sequence composition to
sequences of a given taxon (used in composition-based taxonomic
assignment) or to other sequences in the sample (used in
composition-based clustering), or (c) similarity in read coverage and
linkage information from assembly for contigs within a metagenome
sample. The underlying rationale of using read coverage is that similar
coverage of two contigs in the sample indicates similar abundance and
therefore potentially the same underlying source population in the
community.

How accurately fragments can be assigned to taxonomic bins depends on
several factors. The first is fragment length. Shorter, noisier
fragments cannot be assigned as accurately as longer fragments of 2 kb
or more (Patil et al., 2011). In particular, assignment of individual
reads or of fragments below 1 kb in length poses significant challenges.
Reported assignment accuracies for 100 bp fragments to a clade at the
genus level are 60\% under somewhat idealized conditions, with only
reference data from the same species being removed. This, however, means
that 40\% of fragments are misassigned (Brady \& Salzberg, 2009).
Furthermore, accuracy drops to less than 30\% if the reference data is
depleted of sequences from the same genus, meaning 70\% of 100 bp
fragments are misassigned at the family level.

Another influential factor for binning accuracy is the community's
complexity in terms of the number of distinct phylotypes it comprises.
Metagenome sequencing of complex communities, such as those found in
soil (Mackelprang et al., 2011), results in lower sequencing coverage of
most populations and therefore shorter contigs in assembly. This amounts
to many short fragments, or even predominantly unassembled samples,
which have to be separated into a multitude of taxonomic bins. The
larger the number of bins, the harder the problem becomes, as the
chances of randomly assigning a fragment correctly decrease with
increasing numbers of bins. Finally, for taxonomic assignment, the
availability of reference data from taxa that are closely related to the
microbes of the sequenced community is important for accurate
assignment. Similarity-based assignment of metagenome shotgun sequence
data requires homologous reference sequences from related taxa to be
available for a fragment to be assigned; ideally, entire sequenced
genomes should be available. The sequencing of many isolate genomes of
the human microbiome in the Human Microbiome Project has immensely
helped similarity-based taxonomic assignment of human gut metagenome
samples (Qin et al., 2010; Nelson et al., 2010). A `shallow' (i.e.~to
high-ranking clades only) taxonomic assignment of a sample based on
sequence similarities indicates the presence of many taxa that are only
distantly related to isolated sequenced genomes. If no sequenced genomes
from related taxa are available, composition-based assignment can be
used for higher resolution taxonomic binning. Clustering of metagenome
fragments based on sequence composition does not require reference
sequences, and comparatively small amounts of non-homologous reference
sequences are required for composition-based taxonomic classification.

\hypertarget{tbl:bib_binning_web}{}
\begin{longtable}[]{@{}lcccl@{}}
\caption[Web applications for metagenome binning]{\label{tbl:bib_binning_web}Overview of existing web
applications for taxonomic assignment and phylotyping of metagenome
sequence samples. Phylotyping methods assign only a subset of contigs
based on taxonomic marker genes. }\tabularnewline
\toprule
\begin{minipage}[b]{0.23\columnwidth}\raggedright\strut
Name\strut
\end{minipage} & \begin{minipage}[b]{0.09\columnwidth}\centering\strut
Phylo\-typing\strut
\end{minipage} & \begin{minipage}[b]{0.09\columnwidth}\centering\strut
Tax.\ assign\-ment\strut
\end{minipage} & \begin{minipage}[b]{0.09\columnwidth}\centering\strut
Funct.\ anno\-tation\strut
\end{minipage} & \begin{minipage}[b]{0.36\columnwidth}\raggedright\strut
Techniques \& web link\strut
\end{minipage}\tabularnewline
\midrule
\endfirsthead
\toprule
\begin{minipage}[b]{0.23\columnwidth}\raggedright\strut
Name\strut
\end{minipage} & \begin{minipage}[b]{0.09\columnwidth}\centering\strut
Phylo\-typing\strut
\end{minipage} & \begin{minipage}[b]{0.09\columnwidth}\centering\strut
Tax.\ assign\-ment\strut
\end{minipage} & \begin{minipage}[b]{0.09\columnwidth}\centering\strut
Funct.\ anno\-tation\strut
\end{minipage} & \begin{minipage}[b]{0.36\columnwidth}\raggedright\strut
Techniques \& web link\strut
\end{minipage}\tabularnewline
\midrule
\endhead
\begin{minipage}[t]{0.23\columnwidth}\raggedright\strut
CAMERA v.2 (2011)\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
---\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.36\columnwidth}\raggedright\strut
Reverse PSI-BLAST \url{http://camera.calit2.net}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.23\columnwidth}\raggedright\strut
MetaABC (2011)\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
---\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
---\strut
\end{minipage} & \begin{minipage}[t]{0.36\columnwidth}\raggedright\strut
BLAST, PhymmBL, MEGAN, Sort-ITEMS
\url{http://bits2.iis.sinica.edu.tw/MetaABC/}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.23\columnwidth}\raggedright\strut
MG-RAST v.3.1.2 (2008)\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
---\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.36\columnwidth}\raggedright\strut
BLAST/BLAT \url{http://metagenomics.anl.gov}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.23\columnwidth}\raggedright\strut
MLTreeMap v.2.06.1 (2010)\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
---\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.36\columnwidth}\raggedright\strut
BLAST, HMMER, RAxML \url{http://mltreemap.org}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.23\columnwidth}\raggedright\strut
NBC v.1.1 CLI (2011)\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
---\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
---\strut
\end{minipage} & \begin{minipage}[t]{0.36\columnwidth}\raggedright\strut
Naïve Bayesian Classifier \url{http://nbc.ece.drexel.edu}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.23\columnwidth}\raggedright\strut
PhyloPythia (2007), PhyloPythiaS (2011)\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
---\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
---\strut
\end{minipage} & \begin{minipage}[t]{0.36\columnwidth}\raggedright\strut
(Structured) Support Vector Machine
\url{http://binning.bioinf.mpi-inf.mpg.de}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.23\columnwidth}\raggedright\strut
TaxSOM (2011)\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
---\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
---\strut
\end{minipage} & \begin{minipage}[t]{0.36\columnwidth}\raggedright\strut
Self-Organizing Maps \url{http://soma.arb-silva.de}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.23\columnwidth}\raggedright\strut
WebCARMA v.3.0 (2011)\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\centering\strut
X\strut
\end{minipage} & \begin{minipage}[t]{0.36\columnwidth}\raggedright\strut
BLAST, HMM search versus Pfam
\url{http://webcarma.cebitec.uni-bielefeld.de}\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\section{Taxonomic binning based on sequence
similarities}\label{taxonomic-binning-based-on-sequence-similarities}

Similarity-based taxonomic assignment utilizes the local similarity of a
query sequence to sequences of known taxonomic origin. Taxonomic
identifiers are commonly assigned either by identifying the lowest
common ancestor (LCA) from the taxonomy for the taxa of the most similar
sequences found (Patil et al., 2011) or by using phylogenetic placement
methods. Phylogenetic placement methods, such as pplacer (Matsen, Kodner
\& Armbrust, 2010), EPA/RAxML (Berger, Krompass \& Stamatakis, 2011) and
SEPP (Mirarab, Nguyen \& Warnow, 2012), place the query sequence within a
fixed reference tree. The taxonomic label assigned then corresponds to
the LCA of the taxa associated with the first ancestral node's children.
Both methods are related to `nearest neighbor' classification. In both
cases, there has to be a search phase in which such similarities are
identified. Typically, local similarities to sequence database entries
are searched for with alignment programs such as BLAST (Camacho et al.,
2009). Searches for gene family or protein domain motifs in the query
sequence can be performed with a reference collection of profile Hidden
Markov Models (HMMs). HMMER 3.0, released in 2010, has a 100-fold
increase in speed compared to prior versions, with runtimes being
competitive to blastp (Finn, Clements \& Eddy, 2011). Screening a large
metagenome sample with a collection of profile HMMs for marker genes is
computationally much less demanding than a full search for similar
regions in large sequence collections (Finn, Clements \& Eddy, 2011).
This is because the number of entries to be searched against is
typically several orders of magnitude lower. HMMs are popular in
combination with phylogenetic placement approaches, as the required
multiple alignment of a query sequence to the homologs can be directly
deduced from the state path of the sequence through the HMM and the
multiple alignment used in its construction. However, known marker genes
or protein families from reference collections such as Pfam only cover a
small part of the genes found across diverse environments. Therefore,
most HMM-based approaches (Stark et al., 2010; Gerlach \& Stoye, 2011;
Wu \& Scott, 2012) may be seen as phylotypers of metagenome samples,
rather than binning methods, as they indicate the taxonomic composition
of the sample based on placement of a fraction of the fragments, rather
than assigning the entire sample. Searching for similar sequences in
large sequence collections results in a higher fragment coverage with
hits than when profile HMMs are used. Analysis of a metagenome sequence
sample therefore comes with high computational costs, beyond what a
typical desktop computer is capable of. When using a similarity search,
one is therefore confronted with the question of which reference
sequences to compare to. The choice depends on the available time and
computational resources. Databases that are often searched are NCBI
RefSeq, a non-redundant nucleotide and protein collection for medical,
functional and diversity studies; NCBI whole genomes; NCBI nt, a large
nucleotide collection; and NCBI nr, a large non-redundant protein
collection (Sayers et al., 2009). Software such as MEGAN (Huson et al.,
2011) allows the output of BLAST to be interpreted for the taxonomic and
functional characterization of metagenome samples based on sequence
similarity. If sequenced genomes of species related to the sampled taxa
exist, recruitment analysis can be used (Qin et al., 2010). Here, each
read is compared to a set of genome sequences and `recruited' to the
most similar genome, allowing the identification of reads of the
prevalent species that are closely related to a sequenced reference
collection, if performed with stringent alignment cut-offs (Xie et al.,
2010).

\subsection{Case study 1: Recruitment
analysis}\label{case-study-1-recruitment-analysis}

\emph{In (Xie et al., 2010), Illumina and Roche/454 sequencing were
jointly used to generate 860 Mb of non-human sequence data from a
microbial community of human dental plaque. All obtained reads were
aligned against 50 available reference genomes for human oral microbes
from the Human Microbiome Project using MUMmer, resulting in recruitment
of 4\% of all reads with more than 97\% sequence identity to one of the
reference genomes. This indicates that most of the sampled microbes
originate from species that are too distantly related to the sequenced
reference collection for similarity-based recruitment.}

\section{Taxonomic binning based on sequence
composition}\label{taxonomic-binning-based-on-sequence-composition}

The composition-based approach to taxonomic binning is to utilize the
taxonomic signal contained in fragment-wide GC content, codon usage or
the use of short oligomers (kmers), typically 4--6 bp long. The
observation that such properties tend to vary more across the genomes of
different species than within a given one gave rise to the term genome
signatures (Karlin \& Burge, 1995; Deschavanne et al., 1999). Such
signatures can also be inferred for higher-level clades, allowing their
use for taxonomic fragment assignment across various ranks (McHardy et
al., 2007).

Taxonomic binning based on sequence composition can be performed with
supervised or unsupervised methods. The choice of which to use depends
on the availability of suitable reference data. Unsupervised methods
group fragments with similar composition profiles into clusters,
corresponding to individual taxonomic bins. Inference of the taxonomic
label for a bin can be performed based on taxonomic assignment of marker
genes found in the fragments of a bin. To infer the clustering of
fragments, existing methods use, for example, a graph-cut algorithm or
variations of a self-organizing map algorithm (Chatterji et al., 2008;
Weber et al., 2011). A sample can also be binned with supervised
methods, which assign fragments to clades using a model trained with
available reference sequences. Supervised methods tend to have higher
accuracy than unsupervised methods for taxonomic assignment and are more
easily applied to complex microbial mixtures with skewed organism
abundances. However, they require sufficient amounts of reference
sequences to be identified for the sample populations or higher-level
clades which are to be included in the model. In practice, therefore,
each approach has its own appeal and both are being applied. Methods
used for supervised classification are, for example, (structural)
Support Vector Machines (SVMs) (McHardy et al., 2007), the naïve Bayes
classifier (Rosen, Reichenberger \& Rosenfeld, 2011), a k-nearest
neighbor classifier (Diaz et al., 2009) and Interpolated Markov Models
(Brady \& Salzberg, 2009). As composition-based signatures are a global
attribute of sequences, no entire reference genomes are required, but
only sufficient amounts of sequences for inference of a
composition-based signature. For SVM-based classification, this has been
found to be around 100 kb per clade (Patil et al., 2011). Reference
sequences can be identified among publicly available genomes or by
taxonomic assignment of conserved marker-genes of the sample contigs,
which allows the respective contigs to be used as training material. If
necessary, fosmids carrying marker genes can be sequenced to generate
training material for interesting sample populations or higher-level
clades (Warnecke et al., 2007; Pope et al., 2010, 2011).

\subsection{Case study 2: Taxonomic binning by composition-based
taxonomic
assignment}\label{case-study-2-taxonomic-binning-by-composition-based-taxonomic-assignment}

\emph{In (Pope et al., 2010), a microbial gut community from the
Australian Tammar wallaby was studied by Sanger and 454 sequencing of
metagenome plasmid and fosmid libraries. This microbial community is
involved in the breakdown of plant biomass consumed by the host animal.
Using 16S rRNA analysis, 236 distinct phylotypes were observed. Of the
16S rRNA sequences, 9\% originated from a novel species, Wallaby group 1
(WG-1), in the family of Succinivibrionaceae. PhyloPythia, a
composition-based taxonomic classifier, was used to train a model
including the WG-1 and other relevant clades for species present in the
community. Composition-based taxonomic assignment of the metagenome
sample recovered a 2 Mb draft genome for WG-1. Metabolic reconstruction
based on the draft genome allowed the cultivation requirements for WG-1
to be deduced, leading to isolation, characterization and a draft genome
sequence for the previously unknown species. It also resulted in the
finding that WG-1 contributes to the low-methane emission phenotype of
plant biomass degradation in the Tammar wallaby. The draft genome
sequences from the isolate culture showed 98.9\% sequence identity to
the WG-1 metagenome bin, and 90\% of shared reads and assemblies,
indicating accurate reconstruction of the draft genome from the
metagenome sample by composition-based taxonomic binning.}

\section{Hybrid methods}\label{hybrid-methods}

Several methods combine different types of information to improve
predictive accuracy (Brady \& Salzberg, 2009; Hess et al., 2011; Huson
et al., 2011; Iverson et al., 2012). For instance, read coverage is
combined with an analysis of kmer frequencies in clustering of fragments
(Tyson et al., 2004; Hess et al., 2011). Searches for similar sequences
and analysis of linkage information from an assembly are also combined
with composition-based taxonomic assignment, if the computational burden
can be borne. This has particular advantages for short fragment
analysis. Kmer signatures for fragments below 1 kb in length,
particularly those of individual reads, are noisy, even more so than
taxonomic conservation of sequence similarities (Patil et al., 2011).

\subsection{Case study 3: Taxonomic binning based on clustering by
sequence composition and read
coverage}\label{case-study-3-taxonomic-binning-based-on-clustering-by-sequence-composition-and-read-coverage}

\emph{In one of the most in-depth metagenome studies of a particular
environment undertaken so far, 286 Gb of paired-end Illumina sequence
reads were generated from a sample of the plant-fiber adherent
microbiome from a cow rumen (Hess et al., 2011). Rarefaction analysis of
16S rRNA indicated the presence of \textasciitilde{}1000 distinct OTUs.
Clustering of assembled contigs by agglomerative hierarchical
clustering, based on tetramer frequencies and read coverage, resulted in
the formation of 466 taxonomic bins. Fifteen of these were estimated to
represent largely complete genomes (between 60\% and 92\%), based on
their association with fully sequenced genomes from their respective
clades. This estimate was based on the presence of a minimal set of core
genes found in all sequenced genomes from the respective phylogenetic
order.}

\subsection{Case study 4: Taxonomic binning based on assembly
information and sequence composition (Iverson et al.,
2012)}\label{case-study-4-taxonomic-binning-based-on-assembly-information-and-sequence-composition-iversonuntangling2012.}

\emph{SOLiD sequencing of two marine samples generated 58.5 Gb of
mate-paired reads of 50 bp in length. The number of phylotypes observed
with 16S rRNA analysis was not specified in detail; however, family-level
taxonomic groups were observed with abundances of less than 10\%. From
the metagenome data, 300 Mb of contigs were assembled. Scaffolds --
linked sets of contigs assumed to originate from one genome -- were
generated by splitting the assembly graph, which links contigs based on
mate-pair information, according to mate-pair linkage scores, read
coverage and tetranucleotide usage. Scaffold clustering by
tetranucleotide usage generated 14 partial genome reconstructions from
the two samples, for populations ranging in abundance from 4\% to 10\%
each in one of the samples. Reassembly of 11 mate-pair connected
scaffolds that are binned together based on similar tetranucleotide
statistics and manual gap closure allowed the recovery of a closed
circular 2 Mb genome from an uncultured group, the marine group II
Euryarchaeota.}

\section{Advantages and disadvantages of different binning
approaches}\label{advantages-and-disadvantages-of-different-binning-approaches}

Which binning methodology to use depends on multiple factors, such as
the complexity of the analyzed microbial community, available reference
sequences and computing resources. For taxonomic assignment of
arbitrary sequence fragments to a particular species based on sequence
similarity, completely sequenced reference genomes of closely related
taxa are ideally required, which are often not available. If no
reference data exists for the species of the metagenome sample,
homology-based taxonomic assignment to higher-level clades is more
accurate than composition-based taxonomic assignment for short fragments
of 1 kb or less (Patil et al., 2011). This length corresponds to
individual reads with most sequencing technologies. The assignment of
individual reads in general is, however, notably less accurate than
assignment of longer fragments.

The runtime of sequence similarity searches increases in proportion to
the product of the metagenome sample size (number and length of contigs)
and the size of the reference sequence collection. This makes it a
computationally very demanding task for next-generation sequencing data
sets. The required computing resources are not available in many
experimental laboratories. If researchers are willing to submit their
data to external facilities, data can be processed by web servers such
as MG-RAST, IMG/M or CAMERA, which offer their computational resources
to the community.

The choice of whether to cluster or classify based on sequence
composition depends on availability of some reference data to train a
composition-based classifier. Classification is likely to be more
accurate than clustering in taxonomic assignment. However, if no
reference data is available, clustering will allow resolution of taxonomic
bins which otherwise would go undetected. If multiple types of
information are included in the binning process, as is done in
hybrid approaches, this is likely to increase the overall amount and
accuracy of assignments. Composition-based taxonomic assignment requires
fewer reference sequences than homology-based assignment. This is because
sequence composition is a globally conserved property, while sequence
similarity depends on local sequence conservation between a query and
target. Training times of a composition-based taxonomic classifier
depend on the method used, but training typically requires considerably less
time than searching a reference sequence collection. Once a
composition-based model for taxonomic classification has been trained,
execution times for classification again typically scale linearly with
the metagenome sample size and are independent of a reference sequence
collection. For composition-based clustering, no training phase is
needed. The runtime of clustering typically scales at least
quadratically with the sample size, as it often involves pairwise
comparisons.

\section{Future directions}\label{future-directions}

The recent developments in sequencing technologies have considerably
pushed the boundaries in terms of what can be learned from metagenome
sequence samples. The high sequencing depth of microbial communities, in
combination with the application of sophisticated algorithms, has
allowed the retrieval of near-complete draft genomes from the
metagenomes of many microbial communities, including highly complex
ones, such as those found in soil (Mackelprang et al., 2011). However,
the size and heterogeneity of the different data types produced by the
various novel techniques have created new challenges, which remain to be
addressed. A prominent one is how to further reduce the computational
requirements of searching for local similarities between giga- or even
terabase-sized sequence samples and equivalently large reference
sequence collections. Secondly, it remains to be explored how taxonomic
assignment accuracy can be further improved for the vast majority of
microbial community members that are only distantly related to sequenced
isolate genomes. Due to the value of available sequences from related
taxa for the taxonomic binning of a particular sample, efforts such as
GEBA might help in this regard (Wu et al., 2009). The GEBA project aims
to construct a ``Genomic Encyclopedia for Bacteria and Archaea'' by
strategic sequencing of microbial genomes from all major and minor
taxonomic groups. As the cost of sequencing has decreased, partial
genome reconstruction by single-cell genome sequencing is an attractive
option for obtaining reference sequences for taxonomic binning and draft
genome reconstruction (Woyke et al., 2010) from metagenomes. Here, an
individual cell from a microbial population within a community is
isolated using techniques such as optical tweezers,
fluorescence-activated cell sorting and others, and is then lysed and its genome
sequence amplified with multiple displacement amplification prior to
random shotgun sequencing.

Advances in single-molecule sequencing technologies now allow longer
reads to be generated than what was possible using traditional Sanger
sequencing. Even though this promises to resolve several issues
associated with short read analysis, such as high error rates in
binning, assembly and functional annotation, the larger sequencing error
of some of these technologies, currently estimated to be around 15\%,
presents a different substantial hurdle. Therefore, assessing
technology-specific errors and developing technology-specific denoising
procedures, such as have been developed for 454 amplicon data (Quince et
al., 2009), will be a prerequisite for leveraging the value of these
techniques for metagenome research. An interesting research direction is
to investigate whether composition-based binning is applicable for the
analysis of samples with both microbial and viral content.
Composition-based taxonomic binning has been successfully applied for
the analysis of viral metagenome samples; however, bacteriophage codon
usage to some extent reflects properties of the host (Pride \&
Schoenfeld, 2008; Lucks et al., 2008). Therefore, classification
accuracy and level of taxonomic resolution attainable for viral taxa
will have to be investigated in more detail.

\section{Summary of key points}\label{summary-of-key-points}

NGS technologies generate massive amounts of sequencing data allowing
the in-depth analysis of microbial communities. Taxonomic binning has
allowed draft genomes of microbial species from many environments to be
reconstructed, and the cultivation requirements of a novel uncultured
species to be deduced. To further advance draft genome reconstruction
from metagenome samples, the existing techniques could be further
refined by integrating multiple sources of information and by
appropriately denoising the data under consideration to remove
technology-specific sequencing errors.

\section{Acknowledgements}\label{acknowledgements}

We thank Chris Quince and Alex Sczyrba for providing comments.

\section{Funding}\label{funding}

This work was supported by the German Max Planck Society and
Heinrich-Heine University Düsseldorf.

\section{References}\label{references}

\setlength{\parindent}{0cm} \everypar{\hangindent=.5cm}

\hypertarget{refs}{}
\hypertarget{ref-AmannPhylogenetic1995}{}
\textbf{Amann RI., Ludwig W., Schleifer KH.} \textbf{1995}. Phylogenetic
identification and in situ detection of individual microbial cells
without cultivation. \emph{Microbiological reviews} 59:143--69.

\hypertarget{ref-BergerPerformance2011}{}
\textbf{Berger SA., Krompass D., Stamatakis A.} \textbf{2011}.
Performance, accuracy, and web server for evolutionary placement of
short sequence reads under maximum likelihood. \emph{Systematic biology}
60:291--302. DOI:
\href{https://doi.org/10.1093/sysbio/syr010}{10.1093/sysbio/syr010}.

\hypertarget{ref-BradyPhymm2009}{}
\textbf{Brady A., Salzberg SL.} \textbf{2009}. Phymm and PhymmBL:
Metagenomic phylogenetic classification with interpolated Markov models.
\emph{Nature methods} 6:673--6. DOI:
\href{https://doi.org/10.1038/nmeth.1358}{10.1038/nmeth.1358}.

\hypertarget{ref-CamachoBlast2009}{}
\textbf{Camacho C., Coulouris G., Avagyan V., Ma N., Papadopoulos J.,
Bealer K., Madden TL.} \textbf{2009}. BLAST+: Architecture and
applications. \emph{BMC bioinformatics} 10:421. DOI:
\href{https://doi.org/10.1186/1471-2105-10-421}{10.1186/1471-2105-10-421}.

\hypertarget{ref-ChatterjiCompostbin2008}{}
\textbf{Chatterji S., Yamazaki I., Bai Z., Eisen JA.} \textbf{2008}.
CompostBin: A DNA composition-based algorithm for binning environmental
shotgun reads. In: \emph{Annual International Conference on Research in
Computational Molecular Biology}. Springer, 17--28.

\hypertarget{ref-ColeRibosomal2009}{}
\textbf{Cole JR., Wang Q., Cardenas E., Fish J., Chai B., Farris RJ.,
Kulam-Syed-Mohideen AS., McGarrell DM., Marsh T., Garrity GM., Tiedje
JM.} \textbf{2009}. The Ribosomal Database Project: Improved alignments
and new tools for rRNA analysis. \emph{Nucleic Acids Research}
37:D141--D145. DOI:
\href{https://doi.org/10.1093/nar/gkn879}{10.1093/nar/gkn879}.

\hypertarget{ref-DesantisGreengenes2006}{}
\textbf{DeSantis TZ., Hugenholtz P., Larsen N., Rojas M., Brodie EL.,
Keller K., Huber T., Dalevi D., Hu P., Andersen GL.} \textbf{2006}.
Greengenes, a chimera-checked 16S rRNA gene database and workbench
compatible with ARB. \emph{Applied and environmental microbiology}
72:5069--72. DOI:
\href{https://doi.org/10.1128/AEM.03006-05}{10.1128/AEM.03006-05}.

\hypertarget{ref-DeschavanneGenomic1999}{}
\textbf{Deschavanne PJ., Giron A., Vilain J., Fagot G., Fertil B.}
\textbf{1999}. Genomic signature: Characterization and classification of
species assessed by chaos game representation of sequences.
\emph{Molecular biology and evolution} 16:1391--9.

\hypertarget{ref-DiazTacoa2009}{}
\textbf{Diaz NN., Krause L., Goesmann A., Niehaus K., Nattkemper TW.}
\textbf{2009}. TACOA: Taxonomic classification of environmental genomic
fragments using a kernelized nearest neighbor approach. \emph{BMC
bioinformatics} 10:56. DOI:
\href{https://doi.org/10.1186/1471-2105-10-56}{10.1186/1471-2105-10-56}.

\hypertarget{ref-FinnHmmer2011}{}
\textbf{Finn RD., Clements J., Eddy SR.} \textbf{2011}. HMMER web
server: Interactive sequence similarity searching. \emph{Nucleic acids
research} 39 Suppl 2:W29--37. DOI:
\href{https://doi.org/10.1093/nar/gkr367}{10.1093/nar/gkr367}.

\hypertarget{ref-FuhrmanMicrobial2009}{}
\textbf{Fuhrman JA.} \textbf{2009}. Microbial community structure and
its functional implications. \emph{Nature} 459:193--9. DOI:
\href{https://doi.org/10.1038/nature08058}{10.1038/nature08058}.

\hypertarget{ref-GerlachTaxonomic2011}{}
\textbf{Gerlach W., Stoye J.} \textbf{2011}. Taxonomic classification of
metagenomic shotgun sequences with CARMA3. \emph{Nucleic acids
research}:1--11. DOI:
\href{https://doi.org/10.1093/nar/gkr225}{10.1093/nar/gkr225}.

\hypertarget{ref-HessMetagenomic2011}{}
\textbf{Hess M., Sczyrba A., Egan R., Kim T-W., Chokhawala H., Schroth
G., Luo S., Clark DS., Chen F., Zhang T., Mackie RI., Pennacchio LA.,
Tringe SG., Visel A., Woyke T., Wang Z., Rubin EM.} \textbf{2011}.
Metagenomic discovery of biomass-degrading genes and genomes from cow
rumen. \emph{Science (New York, N.Y.)} 331:463--7. DOI:
\href{https://doi.org/10.1126/science.1200387}{10.1126/science.1200387}.

\hypertarget{ref-HugenholtzExploring2002}{}
\textbf{Hugenholtz P.} \textbf{2002}. Exploring prokaryotic diversity in
the genomic era. \emph{Genome biology} 3:REVIEWS0003.

\hypertarget{ref-HuseExploring2008}{}
\textbf{Huse SM., Dethlefsen L., Huber JA., Welch DM., Relman DA., Sogin
ML.} \textbf{2008}. Exploring microbial diversity and taxonomy using SSU
rRNA hypervariable tag sequencing. \emph{PLoS genetics} 4:e1000255. DOI:
\href{https://doi.org/10.1371/journal.pgen.1000255}{10.1371/journal.pgen.1000255}.

\hypertarget{ref-HusonIntegrative2011}{}
\textbf{Huson DH., Mitra S., Ruscheweyh H-J., Weber N., Schuster SC.}
\textbf{2011}. Integrative analysis of environmental sequences using
MEGAN4. \emph{Genome research} 21:1552--60. DOI:
\href{https://doi.org/10.1101/gr.120618.111}{10.1101/gr.120618.111}.

\hypertarget{ref-IversonUntangling2012}{}
\textbf{Iverson V., Morris RM., Frazar CD., Berthiaume CT., Morales RL.,
Armbrust EV.} \textbf{2012}. Untangling genomes from metagenomes:
Revealing an uncultured class of marine Euryarchaeota. \emph{Science}
335:587--590. DOI:
\href{https://doi.org/10.1126/science.1212665}{10.1126/science.1212665}.

\hypertarget{ref-JeewonDetection2007}{}
\textbf{Jeewon R., Hyde KD.} \textbf{2007}. Detection and diversity of
fungi from environmental samples: Traditional versus molecular
approaches. \emph{Advanced techniques in soil microbiology} 11:1--15.

\hypertarget{ref-KarlinDinucleotide1995}{}
\textbf{Karlin S., Burge C.} \textbf{1995}. Dinucleotide relative
abundance extremes: A genomic signature. \emph{Trends in genetics}
11:283--90.

\hypertarget{ref-KorenBambus2011}{}
\textbf{Koren S., Treangen TJ., Pop M.} \textbf{2011}. Bambus 2:
Scaffolding metagenomes. \emph{Bioinformatics} 27:2964--2971. DOI:
\href{https://doi.org/10.1093/bioinformatics/btr520}{10.1093/bioinformatics/btr520}.

\hypertarget{ref-KuninBioinformatician2008}{}
\textbf{Kunin V., Copeland A., Lapidus A., Mavromatis K., Hugenholtz P.}
\textbf{2008}. A Bioinformatician's Guide to Metagenomics.
\emph{Microbiology and Molecular Biology Reviews} 72:557--578. DOI:
\href{https://doi.org/10.1128/MMBR.00009-08}{10.1128/MMBR.00009-08}.

\hypertarget{ref-LasersonGenovo2011}{}
\textbf{Laserson J., Jojic V., Koller D.} \textbf{2011}. Genovo: De novo
assembly for metagenomes. \emph{Journal of computational biology}
18:429--43. DOI:
\href{https://doi.org/10.1089/cmb.2010.0244}{10.1089/cmb.2010.0244}.

\hypertarget{ref-LucksGenome2008}{}
\textbf{Lucks JB., Nelson DR., Kudla GR., Plotkin JB.} \textbf{2008}.
Genome landscapes and bacteriophage codon usage. \emph{PLoS
computational biology} 4:e1000001. DOI:
\href{https://doi.org/10.1371/journal.pcbi.1000001}{10.1371/journal.pcbi.1000001}.

\hypertarget{ref-MackelprangMetagenomic2011}{}
\textbf{Mackelprang R., Waldrop MP., DeAngelis KM., David MM., Chavarria
KL., Blazewicz SJ., Rubin EM., Jansson JK.} \textbf{2011}. Metagenomic
analysis of a permafrost microbial community reveals a rapid response to
thaw. \emph{Nature} 480:368--371. DOI:
\href{https://doi.org/10.1038/nature10576}{10.1038/nature10576}.

\hypertarget{ref-MardisImpact2008}{}
\textbf{Mardis ER.} \textbf{2008}. The impact of next-generation
sequencing technology on genetics. \emph{Trends in genetics}
24:133--41. DOI:
\href{https://doi.org/10.1016/j.tig.2007.12.007}{10.1016/j.tig.2007.12.007}.

\hypertarget{ref-MarkowitzImg2012}{}
\textbf{Markowitz VM., Chen I-MA., Chu K., Szeto E., Palaniappan K.,
Grechkin Y., Ratner A., Jacob B., Pati A., Huntemann M., Liolios K.,
Pagani I., Anderson I., Mavromatis K., Ivanova NN., Kyrpides NC.}
\textbf{2012}. IMG/M: The integrated metagenome data management and
comparative analysis system. \emph{Nucleic Acids Research}
40:D123--D129. DOI:
\href{https://doi.org/10.1093/nar/gkr975}{10.1093/nar/gkr975}.

\hypertarget{ref-MatsenPplacer2010}{}
\textbf{Matsen FA., Kodner RB., Armbrust EV.} \textbf{2010}. Pplacer:
Linear time maximum-likelihood and Bayesian phylogenetic placement of
sequences onto a fixed reference tree. \emph{BMC bioinformatics} 11:538.
DOI:
\href{https://doi.org/10.1186/1471-2105-11-538}{10.1186/1471-2105-11-538}.

\hypertarget{ref-MchardyAccurate2007}{}
\textbf{McHardy AC., Martín HG., Tsirigos A., Hugenholtz P., Rigoutsos
I.} \textbf{2007}. Accurate phylogenetic classification of
variable-length DNA fragments. \emph{Nature methods} 4:63--72. DOI:
\href{https://doi.org/10.1038/nmeth976}{10.1038/nmeth976}.

\hypertarget{ref-MetzkerSequencing2009}{}
\textbf{Metzker ML.} \textbf{2009}. Sequencing technologies --- the next
generation. \emph{Nature reviews genetics} 11:31--46. DOI:
\href{https://doi.org/10.1038/nrg2626}{10.1038/nrg2626}.

\hypertarget{ref-MeyerMetagenomics2008}{}
\textbf{Meyer F., Paarmann D., D'Souza M., Olson R., Glass EM., Kubal
M., Paczian T., Rodriguez A., Stevens R., Wilke A., Wilkening J.,
Edwards RA.} \textbf{2008}. The metagenomics RAST server -- a public
resource for the automatic phylogenetic and functional analysis of
metagenomes. \emph{BMC bioinformatics} 9:386. DOI:
\href{https://doi.org/10.1186/1471-2105-9-386}{10.1186/1471-2105-9-386}.

\hypertarget{ref-MillerAssembly2010}{}
\textbf{Miller JR., Koren S., Sutton G.} \textbf{2010}. Assembly
algorithms for next-generation sequencing data. \emph{Genomics}
95:315--27. DOI:
\href{https://doi.org/10.1016/j.ygeno.2010.03.001}{10.1016/j.ygeno.2010.03.001}.

\hypertarget{ref-MirarabSepp2012}{}
\textbf{Mirarab S., Nguyen N., Warnow T.} \textbf{2012}. SEPP:
SATé-Enabled Phylogenetic Placement. \emph{Pacific Symposium on
Biocomputing}:247--58.

\hypertarget{ref-NelsonCatalog2010}{}
\textbf{Nelson KE., Weinstock GM., Highlander SK., Worley KC., Creasy
HH., Wortman JR., Rusch DB., Mitreva M., Sodergren E., Chinwalla AT.,
Feldgarden M., Gevers D., Haas BJ., Madupu R., Ward DV., Birren BW.,
Gibbs RA., Methe B., Petrosino JF., Strausberg RL., Sutton GG., White
OR., Wilson RK., Durkin S., Giglio MG., Gujja S., Howarth C., Kodira
CD., Kyrpides N., Mehta T., Muzny DM., Pearson M., Pepin K., Pati A.,
Qin X., Yandava C., Zeng Q., Zhang L., Berlin AM., Chen L., Hepburn
TA., Johnson J., McCorrison J., Miller J., Minx P., Nusbaum C., Russ C.,
Sykes SM., Tomlinson CM., Young S., Warren WC., Badger J., Crabtree J.,
Markowitz VM., Orvis J., Cree A., Ferriera S., Fulton LL., Fulton RS.,
Gillis M., Hemphill LD., Joshi V., Kovar C., Torralba M., Wetterstrand
KA., Abouelleil A., Wollam AM., Buhay CJ., Ding Y., Dugan S., FitzGerald
MG., Holder M., Hostetler J., Clifton SW., Allen-Vercoe E., Earl AM.,
Farmer CN., Liolios K., Surette MG., Xu Q., Pohl C., Wilczek-Boney K.,
Zhu D.} \textbf{2010}. A catalog of reference genomes from the human
microbiome. \emph{Science (New York, N.Y.)} 328:994--9. DOI:
\href{https://doi.org/10.1126/science.1183605}{10.1126/science.1183605}.

\hypertarget{ref-PatilTaxonomic2011}{}
\textbf{Patil KR., Haider P., Pope PB., Turnbaugh PJ., Morrison M.,
Scheffer T., McHardy AC.} \textbf{2011}. Taxonomic metagenome sequence
assignment with structured output models. \emph{Nature Methods}
8:191--192. DOI:
\href{https://doi.org/10.1038/nmeth0311-191}{10.1038/nmeth0311-191}.

\hypertarget{ref-PellScaling2012}{}
\textbf{Pell J., Hintze A., Canino-Koning R., Howe A., Tiedje J., Brown
C.} \textbf{2012}. Scaling metagenome sequence assembly with
probabilistic de Bruijn graphs. \emph{Proceedings of the National
Academy of Sciences of the United States of America} 109:13272--13277.
DOI:
\href{https://doi.org/10.1073/pnas.1121464109}{10.1073/pnas.1121464109}.

\hypertarget{ref-PengMetaidba2011}{}
\textbf{Peng Y., Leung HCM., Yiu SM., Chin FYL.} \textbf{2011}.
Meta-IDBA: A de novo assembler for metagenomic data.
\emph{Bioinformatics (Oxford, England)} 27:i94--i101. DOI:
\href{https://doi.org/10.1093/bioinformatics/btr216}{10.1093/bioinformatics/btr216}.

\hypertarget{ref-PopeAdaptation2010}{}
\textbf{Pope PB., Denman SE., Jones M., Tringe SG., Barry K., Malfatti
SA., McHardy AC., Cheng J-F., Hugenholtz P., McSweeney CS., Morrison M.}
\textbf{2010}. Adaptation to herbivory by the Tammar wallaby includes
bacterial and glycoside hydrolase profiles different from other
herbivores. \emph{Proceedings of the National Academy of Sciences of the
United States of America} 107:14793--8. DOI:
\href{https://doi.org/10.1073/pnas.1005297107}{10.1073/pnas.1005297107}.

\hypertarget{ref-PopeIsolation2011}{}
\textbf{Pope PB., Smith W., Denman SE., Tringe SG., Barry K., Hugenholtz
P., McSweeney CS., McHardy AC., Morrison M.} \textbf{2011}. Isolation
of Succinivibrionaceae implicated in low methane emissions from Tammar
wallabies. \emph{Science (New York, N.Y.)} 333:646--8. DOI:
\href{https://doi.org/10.1126/science.1205760}{10.1126/science.1205760}.

\hypertarget{ref-PrideGenome2008}{}
\textbf{Pride DT., Schoenfeld T.} \textbf{2008}. Genome signature
analysis of thermal virus metagenomes reveals Archaea and thermophilic
signatures. \emph{BMC genomics} 9:420. DOI:
\href{https://doi.org/10.1186/1471-2164-9-420}{10.1186/1471-2164-9-420}.

\hypertarget{ref-PruesseSilva2007}{}
\textbf{Pruesse E., Quast C., Knittel K., Fuchs BM., Ludwig W., Peplies
J., Glöckner FO.} \textbf{2007}. SILVA: A comprehensive online resource
for quality checked and aligned ribosomal RNA sequence data compatible
with ARB. \emph{Nucleic acids research} 35:7188--96. DOI:
\href{https://doi.org/10.1093/nar/gkm864}{10.1093/nar/gkm864}.

\hypertarget{ref-QinHuman2010}{}
\textbf{Qin J., Li R., Raes J., Arumugam M., Burgdorf KS., Manichanh C.,
Nielsen T., Pons N., Levenez F., Yamada T., Mende DR., Li J., Xu J., Li
S., Li D., Cao J., Wang B., Liang H., Zheng H., Xie Y., Tap J., Lepage
P., Bertalan M., Batto J-M., Hansen T., Le Paslier D., Linneberg A.,
Nielsen HB., Pelletier E., Renault P., Sicheritz-Ponten T., Turner K.,
Zhu H., Yu C., Li S., Jian M., Zhou Y., Li Y., Zhang X., Li S., Qin N.,
Yang H., Wang J., Brunak S., Doré J., Guarner F., Kristiansen K.,
Pedersen O., Parkhill J., Weissenbach J., Bork P., Ehrlich SD., Wang J.}
\textbf{2010}. A human gut microbial gene catalogue established by
metagenomic sequencing. \emph{Nature} 464:59--65. DOI:
\href{https://doi.org/10.1038/nature08821}{10.1038/nature08821}.

\hypertarget{ref-QuinceAccurate2009}{}
\textbf{Quince C., Lanzén A., Curtis TP., Davenport RJ., Hall N., Head
IM., Read LF., Sloan WT.} \textbf{2009}. Accurate determination of
microbial diversity from 454 pyrosequencing data. \emph{Nature methods}
6:639--41. DOI:
\href{https://doi.org/10.1038/nmeth.1361}{10.1038/nmeth.1361}.

\hypertarget{ref-RosenNbc2011}{}
\textbf{Rosen GL., Reichenberger ER., Rosenfeld AM.} \textbf{2011}. NBC:
The Naive Bayes Classification tool webserver for taxonomic
classification of metagenomic reads. \emph{Bioinformatics (Oxford,
England)} 27:127--9. DOI:
\href{https://doi.org/10.1093/bioinformatics/btq619}{10.1093/bioinformatics/btq619}.

\hypertarget{ref-SayersDatabase2009}{}
\textbf{Sayers EW., Barrett T., Benson D., Bryant SH., Canese K.,
Chetvernin V., Church DM., DiCuccio M., Edgar R., Federhen S., Feolo M.,
Geer LY., Helmberg W., Kapustin Y., Landsman D., Lipman DJ., Madden TL.,
Maglott DR., Miller V., Mizrachi I., Ostell J., Pruitt KD., Schuler GD.,
Sequeira E., Sherry ST., Shumway M., Sirotkin K., Souvorov A.,
Starchenko G., Tatusova TA., Wagner L., Yaschenko E., Ye J.}
\textbf{2009}. Database resources of the National Center for
Biotechnology Information. \emph{Nucleic acids research} 37:D5--15. DOI:
\href{https://doi.org/10.1093/nar/gkn741}{10.1093/nar/gkn741}.

\hypertarget{ref-SchadtWindow2010}{}
\textbf{Schadt EE., Turner S., Kasarskis A.} \textbf{2010}. A window
into third-generation sequencing. \emph{Human molecular genetics}
19:R227--40. DOI:
\href{https://doi.org/10.1093/hmg/ddq416}{10.1093/hmg/ddq416}.

\hypertarget{ref-SimonMetagenomic2011}{}
\textbf{Simon C., Daniel R.} \textbf{2011}. Metagenomic Analyses: Past
and Future Trends. \emph{Applied and Environmental Microbiology}
77:1153--1161. DOI:
\href{https://doi.org/10.1128/AEM.02345-10}{10.1128/AEM.02345-10}.

\hypertarget{ref-StarkMltreemap2010}{}
\textbf{Stark M., Berger S., Stamatakis A., von Mering C.}
\textbf{2010}. MLTreeMap -- accurate maximum likelihood placement of
environmental DNA sequences into taxonomic and functional reference
phylogenies. \emph{BMC genomics} 11:461. DOI:
\href{https://doi.org/10.1186/1471-2164-11-461}{10.1186/1471-2164-11-461}.

\hypertarget{ref-SuMetaabc2011}{}
\textbf{Su C-H., Hsu M-T., Wang T-Y., Chiang S., Cheng J-H., Weng FC.,
Kao C-Y., Wang D., Tsai H-K.} \textbf{2011}. MetaABC---an integrated
metagenomics platform for data adjustment, binning and clustering.
\emph{Bioinformatics (Oxford, England)} 27:2298--9. DOI:
\href{https://doi.org/10.1093/bioinformatics/btr376}{10.1093/bioinformatics/btr376}.

\hypertarget{ref-SuenInsect2010}{}
\textbf{Suen G., Scott JJ., Aylward FO., Adams SM., Tringe SG.,
Pinto-Tomás AA., Foster CE., Pauly M., Weimer PJ., Barry KW., Goodwin
LA., Bouffard P., Li L., Osterberger J., Harkins TT., Slater SC., Donohue
TJ., Currie CR.} \textbf{2010}. An insect herbivore microbiome with high
plant biomass-degrading capacity. \emph{PLoS genetics} 6:e1001129. DOI:
\href{https://doi.org/10.1371/journal.pgen.1001129}{10.1371/journal.pgen.1001129}.

\hypertarget{ref-SunCommunity2011}{}
\textbf{Sun S., Chen J., Li W., Altintas I., Lin A., Peltier S., Stocks
K., Allen EE., Ellisman M., Grethe J., Wooley J.} \textbf{2011}.
Community cyberinfrastructure for Advanced Microbial Ecology Research
and Analysis: The CAMERA resource. \emph{Nucleic Acids Research}
39:D546--D551. DOI:
\href{https://doi.org/10.1093/nar/gkq1102}{10.1093/nar/gkq1102}.

\hypertarget{ref-ThompsonProperties2011}{}
\textbf{Thompson JF., Milos PM.} \textbf{2011}. The properties and
applications of single-molecule DNA sequencing. \emph{Genome biology}
12:217. DOI:
\href{https://doi.org/10.1186/gb-2011-12-2-217}{10.1186/gb-2011-12-2-217}.

\hypertarget{ref-TringeComparative2005}{}
\textbf{Tringe SG., von Mering C., Kobayashi A., Salamov AA., Chen K.,
Chang HW., Podar M., Short JM., Mathur EJ., Detter JC., Bork P.,
Hugenholtz P., Rubin EM.} \textbf{2005}. Comparative metagenomics of
microbial communities. \emph{Science (New York, N.Y.)} 308:554--7. DOI:
\href{https://doi.org/10.1126/science.1107851}{10.1126/science.1107851}.

\hypertarget{ref-TurnbaughOrganismal2010}{}
\textbf{Turnbaugh PJ., Quince C., Faith JJ., McHardy AC., Yatsunenko T.,
Niazi F., Affourtit J., Egholm M., Henrissat B., Knight R., Gordon JI.}
\textbf{2010}. Organismal, genetic, and transcriptional variation in the
deeply sequenced gut microbiomes of identical twins. \emph{Proceedings
of the National Academy of Sciences of the United States of America}
107:7503--8. DOI:
\href{https://doi.org/10.1073/pnas.1002355107}{10.1073/pnas.1002355107}.

\hypertarget{ref-TysonCommunity2004}{}
\textbf{Tyson GW., Chapman J., Hugenholtz P., Allen EE., Ram RJ.,
Richardson PM., Solovyev VV., Rubin EM., Rokhsar DS., Banfield JF.}
\textbf{2004}. Community structure and metabolism through reconstruction
of microbial genomes from the environment. \emph{Nature} 428:37--43.
DOI: \href{https://doi.org/10.1038/nature02340}{10.1038/nature02340}.

\hypertarget{ref-VenterEnvironmental2004}{}
\textbf{Venter JC., Remington K., Heidelberg JF., Halpern AL., Rusch D.,
Eisen JA., Wu D., Paulsen I., Nelson KE., Nelson W., Fouts DE., Levy
S., Knap AH., Lomas MW., Nealson K., White O., Peterson J., Hoffman J.,
Parsons R., Baden-Tillson H., Pfannkoch C., Rogers Y-H., Smith HO.}
\textbf{2004}. Environmental genome shotgun sequencing of the Sargasso
Sea. \emph{Science (New York, N.Y.)} 304:66--74. DOI:
\href{https://doi.org/10.1126/science.1093857}{10.1126/science.1093857}.

\hypertarget{ref-WarneckeMetagenomic2007}{}
\textbf{Warnecke F., Luginbühl P., Ivanova N., Ghassemian M., Richardson
TH., Stege JT., Cayouette M., McHardy AC., Djordjevic G., Aboushadi N.,
Sorek R., Tringe SG., Podar M., Martin HG., Kunin V., Dalevi D.,
Madejska J., Kirton E., Platt D., Szeto E., Salamov A., Barry K.,
Mikhailova N., Kyrpides NC., Matson EG., Ottesen EA., Zhang X.,
Hernández M., Murillo C., Acosta LG., Rigoutsos I., Tamayo G., Green
BD., Chang C., Rubin EM., Mathur EJ., Robertson DE., Hugenholtz P.,
Leadbetter JR.} \textbf{2007}. Metagenomic and functional analysis of
hindgut microbiota of a wood-feeding higher termite. \emph{Nature}
450:560--5. DOI:
\href{https://doi.org/10.1038/nature06269}{10.1038/nature06269}.

\hypertarget{ref-WeberPractical2011}{}
\textbf{Weber M., Teeling H., Huang S., Waldmann J., Kassabgy M., Fuchs
BM., Klindworth A., Klockow C., Wichels A., Gerdts G., Amann R.,
Glöckner FO.} \textbf{2011}. Practical application of self-organizing
maps to interrelate biodiversity and functional data in NGS-based
metagenomics. \emph{The ISME journal} 5:918--28. DOI:
\href{https://doi.org/10.1038/ismej.2010.180}{10.1038/ismej.2010.180}.

\hypertarget{ref-WilkeningUsing2009}{}
\textbf{Wilkening J., Wilke A., Desai N., Meyer F.} \textbf{2009}. Using
clouds for metagenomics: A case study. In: \emph{2009 IEEE International
Conference on Cluster Computing and Workshops}. IEEE, 1--6. DOI:
\href{https://doi.org/10.1109/CLUSTR.2009.5289187}{10.1109/CLUSTR.2009.5289187}.

\hypertarget{ref-WoykeSymbiosis2006}{}
\textbf{Woyke T., Teeling H., Ivanova NN., Huntemann M., Richter M.,
Glöckner FO., Boffelli D., Anderson IJ., Barry KW., Shapiro HJ., Szeto
E., Kyrpides NC., Mussmann M., Amann R., Bergin C., Ruehland C., Rubin
EM., Dubilier N.} \textbf{2006}. Symbiosis insights through metagenomic
analysis of a microbial consortium. \emph{Nature} 443:950--5. DOI:
\href{https://doi.org/10.1038/nature05192}{10.1038/nature05192}.

\hypertarget{ref-WoykeOne2010}{}
\textbf{Woyke T., Tighe D., Mavromatis K., Clum A., Copeland A.,
Schackwitz W., Lapidus A., Wu D., McCutcheon JP., McDonald BR., Moran
NA., Bristow J., Cheng J-F.} \textbf{2010}. One bacterial cell, one
complete genome. \emph{PLoS ONE} 5:e10314. DOI:
\href{https://doi.org/10.1371/journal.pone.0010314}{10.1371/journal.pone.0010314}.

\hypertarget{ref-WuPhylogenomic2012}{}
\textbf{Wu M., Scott AJ.} \textbf{2012}. Phylogenomic Analysis of
Bacterial and Archaeal Sequences with AMPHORA2. \emph{Bioinformatics
(Oxford, England)} 28:1033--1034. DOI:
\href{https://doi.org/10.1093/bioinformatics/bts079}{10.1093/bioinformatics/bts079}.

\hypertarget{ref-WuPhylogenydriven2009}{}
\textbf{Wu D., Hugenholtz P., Mavromatis K., Pukall R., Dalin E.,
Ivanova NN., Kunin V., Goodwin L., Wu M., Tindall BJ., Hooper SD., Pati
A., Lykidis A., Spring S., Anderson IJ., D'haeseleer P., Zemla A.,
Singer M., Lapidus A., Nolan M., Copeland A., Han C., Chen F., Cheng
J-F., Lucas S., Kerfeld C., Lang E., Gronow S., Chain P., Bruce D.,
Rubin EM., Kyrpides NC., Klenk H-P., Eisen JA.} \textbf{2009}. A
phylogeny-driven genomic encyclopaedia of Bacteria and Archaea.
\emph{Nature} 462:1056--60. DOI:
\href{https://doi.org/10.1038/nature08656}{10.1038/nature08656}.

\hypertarget{ref-XieCommunity2010}{}
\textbf{Xie G., Chain PSG., Lo C-C., Liu K-L., Gans J., Merritt J., Qi
F.} \textbf{2010}. Community and gene composition of a human dental
plaque microbiota obtained by metagenomic sequencing. \emph{Molecular
oral microbiology} 25:391--405. DOI:
\href{https://doi.org/10.1111/j.2041-1014.2010.00587.x}{10.1111/j.2041-1014.2010.00587.x}.
\end{appendices}

\end{document}