diff --git a/core/src/main/scala/org/graphframes/GraphFrame.scala b/core/src/main/scala/org/graphframes/GraphFrame.scala index da59a74f8..af2a1ae1e 100644 --- a/core/src/main/scala/org/graphframes/GraphFrame.scala +++ b/core/src/main/scala/org/graphframes/GraphFrame.scala @@ -383,24 +383,42 @@ class GraphFrame private ( s"Unbounded length patten ${pattern} is not supported! " + "Please a pattern of defined length.") } - val strToSeq: Seq[String] = (min.toInt to max.toInt).reverse.map { hop => - s"($src)-[$name*$hop]->($dst)" + val strToSeq: Seq[(Int, String)] = (min.toInt to max.toInt).reverse.map { hop => + (hop, s"($src)-[$name*$hop]->($dst)") } - val strToSeqReverse: Seq[String] = if (direction.isEmpty) { - (min.toInt to max.toInt).reverse.map(hop => s"($dst)-[$name*$hop]->($src)") + val strToSeqReverse: Seq[(Int, String)] = if (direction.isEmpty) { + (min.toInt to max.toInt).reverse.map(hop => (hop, s"($src)<-[$name*$hop]-($dst)")) } else { - Seq.empty[String] + Seq.empty[(Int, String)] } - (strToSeq ++ strToSeqReverse) - .map(findAugmentedPatterns) - .reduce((a, b) => a.unionByName(b, allowMissingColumns = true)) + val out: Seq[DataFrame] = strToSeq.map { case (hop, patternStr) => + findAugmentedPatterns(patternStr) + .withColumn("_hop", lit(hop)) + .withColumn("_pattern", lit(patternStr)) + .withColumn("_direction", lit("out")) + } + + val in: Seq[DataFrame] = strToSeqReverse.map { case (hop, patternStr) => + findAugmentedPatterns(patternStr) + .withColumn("_hop", lit(hop)) + .withColumn("_pattern", lit(patternStr)) + .withColumn("_direction", lit("in")) + } + + val ret = (out ++ in).reduce((a, b) => a.unionByName(b, allowMissingColumns = true)) + ret.orderBy("_hop", "_direction") case UndirectedPattern(src, name, dst) => val out: DataFrame = findAugmentedPatterns(s"($src)-[$name]->($dst)") - val in: DataFrame = findAugmentedPatterns(s"($dst)-[$name]->($src)") - - out.unionByName(in) + .withColumn("_pattern", lit(s"($src)-[$name]->($dst)")) + .withColumn("_direction", lit("out")) + val in: DataFrame = findAugmentedPatterns(s"($src)<-[$name]-($dst)") + .withColumn("_pattern", lit(s"($src)<-[$name]-($dst)")) + .withColumn("_direction", lit("in")) + + val ret = out.unionByName(in) + ret.orderBy("_direction") case _ => findAugmentedPatterns(pattern) diff --git a/core/src/main/scala/org/graphframes/pattern/patterns.scala b/core/src/main/scala/org/graphframes/pattern/patterns.scala index 022b398fe..92419432d 100644 --- a/core/src/main/scala/org/graphframes/pattern/patterns.scala +++ b/core/src/main/scala/org/graphframes/pattern/patterns.scala @@ -49,9 +49,7 @@ private[graphframes] object PatternParser extends RegexParsers { vertex ~ "-" ~ "[" ~ "[a-zA-Z0-9_]*".r ~ "*" ~ "[0-9]+".r ~ "]" ~ "->" ~ vertex ^^ { case src ~ "-" ~ "[" ~ name ~ "*" ~ num ~ "]" ~ "->" ~ dst => { val hop: Int = num.toInt - if (hop == 1) { - List(if (name.isEmpty) AnonymousEdge(src, dst) else NamedEdge(name, src, dst)) - } else if (hop > 1) { + if (hop > 0) { val midVertices = (1 until hop).map(i => NamedVertex(s"_v$i")) val vertices = src +: midVertices :+ dst vertices @@ -102,7 +100,11 @@ private[graphframes] object Pattern { case reversedEdge(negation, dst, edge, src) => s"$negation($src)-[$edge]->($dst)" case bidirectionalEdge(negation, src, edge, dst) => - s"$negation($src)-[$edge]->($dst);($dst)-[$edge]->($src)" + if (edge.isEmpty || edge.contains("*")) { + s"$negation($src)-[$edge]->($dst);($dst)-[$edge]->($src)" + } else { + s"$negation($src)-[${edge}1]->($dst);($dst)-[${edge}2]->($src)" + } case original => original } } diff --git a/core/src/test/scala/org/graphframes/PatternMatchSuite.scala b/core/src/test/scala/org/graphframes/PatternMatchSuite.scala index 097c76fd3..5a91c4fe7 100644 --- a/core/src/test/scala/org/graphframes/PatternMatchSuite.scala +++ b/core/src/test/scala/org/graphframes/PatternMatchSuite.scala @@ -594,6 +594,7 @@ class PatternMatchSuite extends SparkFunSuite with GraphFrameTestSparkContext { val varEdge = g .find("(u)-[*2..2]->(v)") .where("u.id == 0") + .drop("_hop", "_pattern", "_direction") val fixedEdge = g .find("(u)-[*2]->(v)") @@ -607,6 +608,7 @@ class PatternMatchSuite extends SparkFunSuite with GraphFrameTestSparkContext { val varEdge = g .find("(u)-[*2..3]->(v)") .where("u.id == 0") + .drop("_hop", "_pattern", "_direction") val fixedEdge2 = g .find("(u)-[*2]->(v)") @@ -628,7 +630,8 @@ class PatternMatchSuite extends SparkFunSuite with GraphFrameTestSparkContext { .find("(u)-[e*2..3]->(v)") .where("u.id == 0") - val expectedCols = Seq("u", "_e1", "_v1", "_e2", "_v2", "_e3", "v") + val expectedCols = + Seq("u", "_e1", "_v1", "_e2", "_v2", "_e3", "v", "_hop", "_pattern", "_direction") assert(varEdge.schema.map(_.name) == expectedCols) } @@ -637,6 +640,7 @@ class PatternMatchSuite extends SparkFunSuite with GraphFrameTestSparkContext { val varEdge = g .find("(u)-[*3..5]->(v)") .where("u.id == 0") + .drop("_hop", "_pattern", "_direction") val fixedEdge3 = g .find("(u)-[*3]->(v)") @@ -688,14 +692,17 @@ class PatternMatchSuite extends SparkFunSuite with GraphFrameTestSparkContext { val res = g .find("(u)-[e*1..3]-(v)") .where("u.id == 2") + .drop("_pattern", "_direction") val df1 = g .find("(u)-[e*1..3]->(v)") .where("u.id == 2") + .drop("_pattern", "_direction") val df2 = g .find("(v)-[e*1..3]->(u)") .where("u.id == 2") + .drop("_pattern", "_direction") val expected = df1.unionByName(df2, allowMissingColumns = true) diff --git a/core/src/test/scala/org/graphframes/pattern/PatternSuite.scala b/core/src/test/scala/org/graphframes/pattern/PatternSuite.scala index deee118a9..d6eefbe7d 100644 --- a/core/src/test/scala/org/graphframes/pattern/PatternSuite.scala +++ b/core/src/test/scala/org/graphframes/pattern/PatternSuite.scala @@ -30,6 +30,17 @@ class PatternSuite extends SparkFunSuite { Pattern.parse("(u)-[e]->(v)") === Seq(NamedEdge("e", NamedVertex("u"), NamedVertex("v")))) + assert( + Pattern.parse("(u)-[e*1]->(v)") === + Seq(NamedEdge("_e1", NamedVertex("u"), NamedVertex("v")))) + + assert( + Pattern.parse("(u)-[e*3]->(v)") === + Seq( + NamedEdge("_e1", NamedVertex("u"), NamedVertex("_v1")), + NamedEdge("_e2", NamedVertex("_v1"), NamedVertex("_v2")), + NamedEdge("_e3", NamedVertex("_v2"), NamedVertex("v")))) + assert( Pattern.parse("()-[]->(v)") === Seq(AnonymousEdge(AnonymousVertex, NamedVertex("v")))) @@ -90,7 +101,7 @@ class PatternSuite extends SparkFunSuite { assert( Pattern.rewriteIncomingEdges("(u)<-[]-(v);(u)-[e]->(v)") === "(v)-[]->(u);(u)-[e]->(v)") assert(Pattern.rewriteIncomingEdges("(u)<-[]->(v)") === "(u)-[]->(v);(v)-[]->(u)") - assert(Pattern.rewriteIncomingEdges("(u)<-[e]->(v)") === "(u)-[e]->(v);(v)-[e]->(u)") + assert(Pattern.rewriteIncomingEdges("(u)<-[e]->(v)") === "(u)-[e1]->(v);(v)-[e2]->(u)") assert(Pattern.rewriteIncomingEdges("(u)<-[*5]-(v)") === "(v)-[*5]->(u)") assert(Pattern.rewriteIncomingEdges("(u)<-[*5]->(v)") === "(u)-[*5]->(v);(v)-[*5]->(u)") assert( diff --git a/docs/src/04-user-guide/04-motif-finding.md b/docs/src/04-user-guide/04-motif-finding.md index dd39222a6..1c5fe142a 100644 --- a/docs/src/04-user-guide/04-motif-finding.md +++ b/docs/src/04-user-guide/04-motif-finding.md @@ -2,7 +2,7 @@ Motif finding refers to searching for structural patterns in a graph. For an example of real-world use, check out the [Motif Finding Tutorial](/03-tutorials/02-motif-tutorial.md). -GraphFrame motif finding uses a simple Domain-Specific Language (DSL) for expressing structural queries. For example, `graph.find("(a)-[e]->(b); (b)-[e2]->(a)")` will search for pairs of vertices `a,b` connected by edges in both directions. It will return a `DataFrame` of all such structures in the graph, with columns for each of the named elements (vertices or edges) in the motif. In this case, the returned columns will be "a, b, e, e2." +GraphFrame motif finding uses a simple Domain-Specific Language (DSL) for expressing structural queries. For example, `graph.find("(a)-[e1]->(b); (b)-[e2]->(a)")` or `graph.find("(a)<-[e]->(b)")` will search for pairs of vertices `a`,`b` connected by edges in both directions. It will return a `DataFrame` of all such structures in the graph, with columns for each of the named elements (vertices or edges) in the motif. In this case, the returned columns will be "a, b, e1, e2." DSL for expressing structural patterns: @@ -10,14 +10,17 @@ DSL for expressing structural patterns: For example, `"(a)-[e]->(b)"` expresses an edge `e` from vertex `a` to vertex `b`. Note that vertices are denoted by parentheses `(a)`, while edges are denoted by square brackets `[e]`. -* A pattern is expressed as a union of edges. Edge patterns can be joined with semicolons. - Motif `"(a)-[e]->(b); (b)-[e2]->(c)"` specifies two edges from `a` to `b` to `c`. +* A pattern is expressed as a join of edges. Edge patterns can be joined with semicolons. + Motif `"(a)-[e1]->(b); (b)-[e2]->(c)"` specifies two edges from `a` to `b` to `c`. +* Simply, you can also quantify the fixed length like `"(a)-[e*2]->(c)"`. The motif parser decompose it into multiple patterns `"(a)-[e1]->(_v1);(_v1)-[e1]->(c)"` by inserting interim vertexes arbitrarily. It specifies two edges from `a` to `_v1` to `c`. +* In order to search for variable-length motifs, you can specify the range `"(a)-[e*1..3]->(c)"`. It unions the results from each possible length `"(a)-[e*1]->(c)"`, `"(a)-[e*2]->(c)"`, and `"(a)-[e*3]->(c)"` into a DataFrame. +* If the direction is omitted `"(a)-[e]-(b)"`, it represents an undirected pattern — that is, either `"(a)-[e]->(b)"` or `"(a)<-[e]-(b)"`, which includes edges that are incoming or outgoing. * Within a pattern, names can be assigned to vertices and edges. For example, `"(a)-[e]->(b)"` has three named elements: vertices `a,b` and edge `e`. These names serve two purposes: * The names can identify common elements among edges. For example, - `"(a)-[e]->(b); (b)-[e2]->(c)"` specifies that the same vertex `b` is the destination - of edge `e` and source of edge `e2`. + `"(a)-[e1]->(b); (b)-[e2]->(c)"` specifies that the same vertex `b` is the destination + of edge `e1` and source of edge `e2`. * The names are used as column names in the result `DataFrame`. If a motif contains named vertex `a`, then the result `DataFrame` will contain a column "a" which is a `StructType` with sub-fields equivalent to the schema (columns) of @@ -26,7 +29,7 @@ DSL for expressing structural patterns: `GraphFrame.edges`. * Be aware that names do *not* identify *distinct* elements: two elements with different names may refer to the same graph element. For example, in the motif - `"(a)-[e]->(b); (b)-[e2]->(c)"`, the names `a` and `c` could refer to the same vertex. + `"(a)-[e1]->(b); (b)-[e2]->(c)"`, the names `a` and `c` could refer to the same vertex. To restrict named elements to be distinct vertices or edges, use post-hoc filters such as `resultDataframe.filter("a.id != c.id")`. * It is acceptable to omit names for vertices or edges in motifs when not needed. @@ -42,6 +45,9 @@ Restrictions: * Motifs are not allowed to contain edges without any named elements: `"()-[]->()"` and `"!()-[]->()"` are prohibited terms. * Motifs are not allowed to contain named edges within negated terms (since these named edges would never appear within results). E.g., `"!(a)-[ab]->(b)"` is invalid, but `"!(a)-[]->(b)"` is valid. +* Negation is not supported for the variable-length pattern and undirected pattern: `"!(a)-[*1..3]->(b)"` and `"!(a)-[]-(b)"` are not allowed. +* Unbounded length patten is not supported: `"(a)-[*..3]->(b)"` and `"(a)-[*1..]->(b)"` are not allowed. +* You cannot join additional edges with the variable length pattern: `"(a)-[*1..3]-(b);(b)-[]-(c)"`is not valid. More complex queries, such as queries which operate on vertex or edge attributes, can be expressed by applying filters to the result `DataFrame`. @@ -59,7 +65,7 @@ from graphframes.examples import Graphs g = Graphs(spark).friends() # Get example graph # Search for pairs of vertices with edges in both directions between them -motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)") +motifs = g.find("(a)-[e1]->(b); (b)-[e2]->(a)") motifs.show() # More complex queries can be expressed by applying filters @@ -77,7 +83,7 @@ import org.graphframes.{examples,GraphFrame} val g: GraphFrame = examples.Graphs.friends // get example graph // Search for pairs of vertices with edges in both directions between them. -val motifs: DataFrame = g.find("(a)-[e]->(b); (b)-[e2]->(a)") +val motifs: DataFrame = g.find("(a)-[e1]->(b); (b)-[e2]->(a)") motifs.show() // More complex queries can be expressed by applying filters. @@ -154,7 +160,3 @@ val condition = { Seq("ab", "bc", "cd") val chainWith2Friends2 = chain4.where(condition >= 2) chainWith2Friends2.show() ``` - -### Conclusion - -The above example demonstrated a stateful motif for a fixed-length chain. Currently, in order to search for variable-length motifs, users need to run one query for each possible length. However, the above query patterns allow users to re-use the same code for each length, with the only change being to update the sequence of motif elements ("ab", "bc", "cd" above).