adam-gfa
Graphical Fragment Assembly (GFA) support for ADAM.
Hacking adam-gfa
Install
- JDK 1.8 or later, http://openjdk.java.net
- Apache Maven 3.3.9 or later, http://maven.apache.org
- Apache Spark 2.4.6 or later, built for Scala 2.11, http://spark.apache.org
To build
$ mvn install
Running adam-gfa
Read and write Graphical Fragment Assembly (GFA) version 1.0
$ spark-submit \
--class com.github.heuermh.adam.gfa.Gfa1 \
target/adam-gfa_2.11-${version}.jar \
in.gfa \
out.gfa
Transform GFA 1.0 to generic `Gfa1Record` records in Parquet format
$ spark-submit \
--class com.github.heuermh.adam.gfa.Gfa1ToDataframe \
target/adam-gfa_2.11-${version}.jar \
in.gfa \
out.parquet
Transform GFA 1.0 to specific `Link`, `Path`, `Segment`, and `Traversal` records in Parquet format
$ spark-submit \
--class com.github.heuermh.adam.gfa.Gfa1ToDataframes \
target/adam-gfa_2.11-${version}.jar \
in.gfa \
out
(creates separate `out.links.parquet`, `out.paths.parquet`, `out.segments.parquet`, and `out.traversals.parquet` directories)
Read and write Graphical Fragment Assembly (GFA) version 2.0
$ spark-submit \
--class com.github.heuermh.adam.gfa.Gfa2 \
target/adam-gfa_2.11-${version}.jar \
in.gfa2 \
out.gfa2
Graphical Fragment Assembly (GFA) version 1.0 schema in Parquet format
Gfa1Record
message spark_schema {
optional binary recordType (UTF8);
optional binary id (UTF8);
optional binary sequence (UTF8);
optional int32 length;
optional int32 readCount;
optional int32 fragmentCount;
optional int32 kmerCount;
optional binary sequenceChecksum (UTF8);
optional binary sequenceUri (UTF8);
optional group source {
optional binary id (UTF8);
optional binary orientation (UTF8);
}
optional group target {
optional binary id (UTF8);
optional binary orientation (UTF8);
}
optional binary overlap (UTF8);
optional int32 mappingQuality;
optional int32 mismatchCount;
optional binary pathName (UTF8);
optional group segments (LIST) {
repeated group list {
optional group element {
optional binary id (UTF8);
optional binary orientation (UTF8);
}
}
}
optional group overlaps (LIST) {
repeated group list {
optional binary element (UTF8);
}
}
optional int32 ordinal;
optional group tags (MAP) {
repeated group key_value {
required binary key (UTF8);
optional group value {
optional binary name (UTF8);
optional binary type (UTF8);
optional binary value (UTF8);
}
}
}
}
Link
message spark_schema {
optional binary id (UTF8);
optional group source {
optional binary id (UTF8);
optional binary orientation (UTF8);
}
optional group target {
optional binary id (UTF8);
optional binary orientation (UTF8);
}
optional binary overlap (UTF8);
optional int32 mappingQuality;
optional int32 mismatchCount;
optional int32 readCount;
optional int32 fragmentCount;
optional int32 kmerCount;
optional group tags (MAP) {
repeated group key_value {
required binary key (UTF8);
optional group value {
optional binary name (UTF8);
optional binary type (UTF8);
optional binary value (UTF8);
}
}
}
}
Path
message spark_schema {
optional binary pathName (UTF8);
optional group segments (LIST) {
repeated group list {
optional group element {
optional binary id (UTF8);
optional binary orientation (UTF8);
}
}
}
optional group overlaps (LIST) {
repeated group list {
optional binary element (UTF8);
}
}
optional group tags (MAP) {
repeated group key_value {
required binary key (UTF8);
optional group value {
optional binary name (UTF8);
optional binary type (UTF8);
optional binary value (UTF8);
}
}
}
}
Segment
message spark_schema {
optional binary id (UTF8);
optional binary sequence (UTF8);
optional int32 length;
optional int32 readCount;
optional int32 fragmentCount;
optional int32 kmerCount;
optional binary sequenceChecksum (UTF8);
optional binary sequenceUri (UTF8);
optional group tags (MAP) {
repeated group key_value {
required binary key (UTF8);
optional group value {
optional binary name (UTF8);
optional binary type (UTF8);
optional binary value (UTF8);
}
}
}
}
Traversal
message spark_schema {
optional binary pathName (UTF8);
optional int32 ordinal;
optional group source {
optional binary id (UTF8);
optional binary orientation (UTF8);
}
optional group target {
optional binary id (UTF8);
optional binary orientation (UTF8);
}
optional binary overlap (UTF8);
optional group tags (MAP) {
repeated group key_value {
required binary key (UTF8);
optional group value {
optional binary name (UTF8);
optional binary type (UTF8);
optional binary value (UTF8);
}
}
}
}