Skip to content

Commit 41edcc0

Browse files
author
Hongjiang Zhang
committed
[TINKERPOP-3133] Allow customize the output partition
set the repartition number by, for example, 'gremlin.spark.outputRepartition=500'. Only integer values larger than 0 are valid input; otherwise the repartition step is skipped silently.
1 parent bdba436 commit 41edcc0

File tree

3 files changed

+38
-7
lines changed

3 files changed

+38
-7
lines changed

hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/Constants.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ private Constants() {
7474
public static final String GREMLIN_SPARK_SKIP_PARTITIONER = "gremlin.spark.skipPartitioner"; // don't partition the loadedGraphRDD
7575
public static final String GREMLIN_SPARK_SKIP_GRAPH_CACHE = "gremlin.spark.skipGraphCache"; // don't cache the loadedGraphRDD (ignores graphStorageLevel)
7676
public static final String GREMLIN_SPARK_DONT_DELETE_NON_EMPTY_OUTPUT = "gremlin.spark.dontDeleteNonEmptyOutput"; // don't delete the output if it is not empty
77+
public static final String GREMLIN_SPARK_OUTPUT_REPARTITION = "gremlin.spark.outputRepartition"; // allows setting the partition count of the output RDD to reduce the number of small HDFS files
7778
public static final String SPARK_SERIALIZER = "spark.serializer";
7879
public static final String SPARK_KRYO_REGISTRATOR = "spark.kryo.registrator";
7980
public static final String SPARK_KRYO_REGISTRATION_REQUIRED = "spark.kryo.registrationRequired";

spark-gremlin/src/main/java/org/apache/tinkerpop/gremlin/spark/structure/io/OutputFormatRDD.java

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<O
4848
final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
4949
if (null != outputLocation) {
5050
// map back to a <nullwritable,vertexwritable> stream for output
51-
graphRDD.mapToPair(tuple -> new Tuple2<>(NullWritable.get(), tuple._2()))
51+
JavaPairRDD<Object, VertexWritable> javaPairRDD = repartitionJavaPairRDD(hadoopConfiguration, graphRDD);
52+
javaPairRDD.mapToPair(tuple -> new Tuple2<>(NullWritable.get(), tuple._2()))
5253
.saveAsNewAPIHadoopFile(Constants.getGraphLocation(outputLocation),
5354
NullWritable.class,
5455
VertexWritable.class,
@@ -62,7 +63,8 @@ public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration config
6263
final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
6364
if (null != outputLocation) {
6465
// map back to a Hadoop stream for output
65-
memoryRDD.mapToPair(keyValue -> new Tuple2<>(new ObjectWritable<>(keyValue._1()), new ObjectWritable<>(keyValue._2())))
66+
JavaPairRDD<K, V> javaPairRDD = repartitionJavaPairRDD(hadoopConfiguration, memoryRDD);
67+
javaPairRDD.mapToPair(keyValue -> new Tuple2<>(new ObjectWritable<>(keyValue._1()), new ObjectWritable<>(keyValue._2())))
6668
.saveAsNewAPIHadoopFile(Constants.getMemoryLocation(outputLocation, memoryKey),
6769
ObjectWritable.class,
6870
ObjectWritable.class,
@@ -75,4 +77,17 @@ public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration config
7577
}
7678
return Collections.emptyIterator();
7779
}
78-
}
80+
81+
/**
82+
* Allow users to customize the RDD partitions to reduce HDFS small files
83+
*/
84+
private static <K, V> JavaPairRDD<K, V> repartitionJavaPairRDD(final org.apache.hadoop.conf.Configuration hadoopConfiguration, JavaPairRDD<K, V> graphRDD) {
85+
JavaPairRDD<K, V> javaPairRDD = graphRDD;
86+
final String repartitionString = hadoopConfiguration.get(Constants.GREMLIN_SPARK_OUTPUT_REPARTITION);
87+
final int repartition = null == repartitionString ? -1 : Integer.parseInt(repartitionString);
88+
if (repartition > 0) {
89+
javaPairRDD = javaPairRDD.repartition(repartition);
90+
}
91+
return javaPairRDD;
92+
}
93+
}

spark-gremlin/src/main/java/org/apache/tinkerpop/gremlin/spark/structure/io/PersistedOutputRDD.java

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,15 +51,16 @@ public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<O
5151
SparkContextStorage.open(configuration).rm(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)); // this might be bad cause it unpersists the job RDD
5252
// determine which storage level to persist the RDD as with MEMORY_ONLY being the default cache()
5353
final StorageLevel storageLevel = StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"));
54+
final JavaPairRDD<Object, VertexWritable> javaPairRDD = repartitionJavaPairRDD(configuration, graphRDD);
5455
if (!configuration.getBoolean(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, true))
55-
graphRDD.mapValues(vertex -> {
56+
javaPairRDD.mapValues(vertex -> {
5657
vertex.get().dropEdges(Direction.BOTH);
5758
return vertex;
5859
}).setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
5960
// call action to eager store rdd
6061
.count();
6162
else
62-
graphRDD.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
63+
javaPairRDD.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
6364
// call action to eager store rdd
6465
.count();
6566
Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
@@ -73,15 +74,29 @@ public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration config
7374
throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to");
7475
final String memoryRDDName = Constants.getMemoryLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), memoryKey);
7576
Spark.removeRDD(memoryRDDName);
76-
memoryRDD.setName(memoryRDDName).persist(StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY")))
77+
final JavaPairRDD<K, V> javaPairRDD = repartitionJavaPairRDD(configuration, memoryRDD);
78+
javaPairRDD.setName(memoryRDDName).persist(StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY")))
7779
// call action to eager store rdd
7880
.count();
7981
Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
80-
return IteratorUtils.map(memoryRDD.collect().iterator(), tuple -> new KeyValue<>(tuple._1(), tuple._2()));
82+
return IteratorUtils.map(javaPairRDD.collect().iterator(), tuple -> new KeyValue<>(tuple._1(), tuple._2()));
8183
}
8284

8385
@Override
8486
public boolean supportsResultGraphPersistCombination(final GraphComputer.ResultGraph resultGraph, final GraphComputer.Persist persist) {
8587
return persist.equals(GraphComputer.Persist.NOTHING) || resultGraph.equals(GraphComputer.ResultGraph.NEW);
8688
}
89+
90+
/**
91+
* Allow users to customize the RDD partitions to reduce HDFS small files
92+
*/
93+
private static <K, V> JavaPairRDD<K, V> repartitionJavaPairRDD(final Configuration configuration, JavaPairRDD<K, V> graphRDD) {
94+
JavaPairRDD<K, V> javaPairRDD = graphRDD;
95+
final String repartitionString = configuration.getString(Constants.GREMLIN_SPARK_OUTPUT_REPARTITION);
96+
final int repartition = null == repartitionString ? -1 : Integer.parseInt(repartitionString);
97+
if (repartition > 0) {
98+
javaPairRDD = javaPairRDD.repartition(repartition);
99+
}
100+
return javaPairRDD;
101+
}
87102
}

0 commit comments

Comments
 (0)