8000 Improve result formatting for variable length pattern and undirected pattern by goungoun · Pull Request #728 · graphframes/graphframes · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
8000
Diff view
40 changes: 29 additions & 11 deletions core/src/main/scala/org/graphframes/GraphFrame.scala
Original file line number Diff line number Diff line change
Expand Up @@ -383,24 +383,42 @@ class GraphFrame private (
s"Unbounded length patten ${pattern} is not supported! " +
"Please a pattern of defined length.")
}
val strToSeq: Seq[String] = (min.toInt to max.toInt).reverse.map { hop =>
s"($src)-[$name*$hop]->($dst)"
val strToSeq: Seq[(Int, String)] = (min.toInt to max.toInt).reverse.map { hop =>
(hop, s"($src)-[$name*$hop]->($dst)")
}
val strToSeqReverse: Seq[String] = if (direction.isEmpty) {
(min.toInt to max.toInt).reverse.map(hop => s"($dst)-[$name*$hop]->($src)")
val strToSeqReverse: Seq[(Int, String)] = if (direction.isEmpty) {
(min.toInt to max.toInt).reverse.map(hop => (hop, s"($src)<-[$name*$hop]-($dst)"))
} else {
Seq.empty[String]
Seq.empty[(Int, String)]
}

(strToSeq ++ strToSeqReverse)
.map(findAugmentedPatterns)
.reduce((a, b) => a.unionByName(b, allowMissingColumns = true))
val out: Seq[DataFrame] = strToSeq.map { case (hop, patternStr) =>
findAugmentedPatterns(patternStr)
.withColumn("_hop", lit(hop))
.withColumn("_pattern", lit(patternStr))
.withColumn("_direction", lit("out"))
}

val in: Seq[DataFrame] = strToSeqReverse.map { case (hop, patternStr) =>
8000 findAugmentedPatterns(patternStr)
.withColumn("_hop", lit(hop))
.withColumn("_pattern", lit(patternStr))
.withColumn("_direction", lit("in"))
}

val ret = (out ++ in).reduce((a, b) => a.unionByName(b, allowMissingColumns = true))
ret.orderBy("_hop", "_direction")

case UndirectedPattern(src, name, dst) =>
val out: DataFrame = findAugmentedPatterns(s"($src)-[$name]->($dst)")
val in: DataFrame = findAugmentedPatterns(s"($dst)-[$name]->($src)")

out.unionByName(in)
.withColumn("_pattern", lit(s"($src)-[$name]->($dst)"))
.withColumn("_direction", lit("out"))
val in: DataFrame = findAugmentedPatterns(s"($src)<-[$name]-($dst)")
.withColumn("_pattern", lit(s"($src)<-[$name]-($dst)"))
.withColumn("_direction", lit("in"))

val ret = out.unionByName(in)
ret.orderBy("_direction")

case _ =>
findAugmentedPatterns(pattern)
Expand Down
10 changes: 6 additions & 4 deletions core/src/main/scala/org/graphframes/pattern/patterns.scala
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,7 @@ private[graphframes] object PatternParser extends RegexParsers {
vertex ~ "-" ~ "[" ~ "[a-zA-Z0-9_]*".r ~ "*" ~ "[0-9]+".r ~ "]& 8000 quot; ~ "->" ~ vertex ^^ {
case src ~ "-" ~ "[" ~ name ~ "*" ~ num ~ "]" ~ "->" ~ dst => {
val hop: Int = num.toInt
if (hop == 1) {
List(if (name.isEmpty) AnonymousEdge(src, dst) else NamedEdge(name, src, dst))
} else if (hop > 1) {
if (hop > 0) {
val midVertices = (1 until hop).map(i => NamedVertex(s"_v$i"))
val vertices = src +: midVertices :+ dst
vertices
Expand Down Expand Up @@ -102,7 +100,11 @@ private[graphframes] object Pattern {
case reversedEdge(negation, dst, edge, src) =>
s"$negation($src)-[$edge]->($dst)"
case bidirectionalEdge(negation, src, edge, dst) =>
s"$negation($src)-[$edge]->($dst);($dst)-[$edge]->($src)"
if (edge.isEmpty || edge.contains("*")) {
s"$negation($src)-[$edge]->($dst);($dst)-[$edge]->($src)"
} else {
s"$negation($src)-[${edge}1]->($dst);($dst)-[${edge}2]->($src)"
}
case original => original
}
}
Expand Down
9 changes: 8 additions & 1 deletion core/src/test/scala/org/graphframes/PatternMatchSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,7 @@ class PatternMatchSuite extends SparkFunSuite with GraphFrameTestSparkContext {
val varEdge = g
.find("(u)-[*2..2]->(v)")
.where("u.id == 0")
.drop("_hop", "_pattern", "_direction")

val fixedEdge = g
.find("(u)-[*2]->(v)")
Expand All @@ -607,6 +608,7 @@ class PatternMatchSuite extends SparkFunSuite with GraphFrameTestSparkContext {
val varEdge = g
.find("(u)-[*2..3]->(v)")
.where("u.id == 0")
.drop("_hop", "_pattern", "_direction")

val fixedEdge2 = g
.find("(u)-[*2]->(v)")
Expand All @@ -628,7 +630,8 @@ class PatternMatchSuite extends SparkFunSuite with GraphFrameTestSparkContext {
.find("(u)-[e*2..3]->(v)")
.where("u.id == 0")

val expectedCols = Seq("u", "_e1", "_v1", "_e2", "_v2", "_e3", "v")
val expectedCols =
Seq("u", "_e1", "_v1", "_e2", "_v2", "_e3", "v", "_hop", "_pattern", "_direction")

assert(varEdge.schema.map(_.name) == expectedCols)
}
Expand All @@ -637,6 +640,7 @@ class PatternMatchSuite extends SparkFunSuite with GraphFrameTestSparkContext {
val varEdge = g
.find("(u)-[*3..5]->(v)")
.where("u.id == 0")
.drop("_hop", "_pattern", "_direction")

val fixedEdge3 = g
.find("(u)-[*3]->(v)")
Expand Down Expand Up @@ -688,14 +692,17 @@ class PatternMatchSuite extends SparkFunSuite with GraphFrameTestSparkContext {
val res = g
.find("(u)-[e*1..3]-(v)")
.where("u.id == 2")
.drop("_pattern", "_direction")

val df1 = g
.find("(u)-[e*1..3]->(v)")
.where("u.id == 2")
.drop("_pattern", "_direction")

val df2 = g
.find("(v)-[e*1..3]->(u)")
.where("u.id == 2")
.drop("_pattern", "_direction")

val expected = df1.unionByName(df2, allowMissingColumns = true)

Expand Down
13 changes: 12 additions & 1 deletion core/src/test/scala/org/graphframes/pattern/PatternSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,17 @@ class PatternSuite extends SparkFunSuite {
Pattern.parse("(u)-[e]->(v)") ===
Seq(NamedEdge("e", NamedVertex("u"), NamedVertex("v"))))

assert(
Pattern.parse("(u)-[e*1]->(v)") ===
Seq(NamedEdge("_e1", NamedVertex("u"), NamedVertex("v"))))

assert(
Pattern.parse("(u)-[e*3]->(v)") ===
Seq(
NamedEdge("_e1", NamedVertex("u"), NamedVertex("_v1")),
NamedEdge("_e2", NamedVertex("_v1"), NamedVertex("_v2")),
NamedEdge("_e3", NamedVertex("_v2"), NamedVertex("v"))))

assert(
Pattern.parse("()-[]->(v)") ===
Seq(AnonymousEdge(AnonymousVertex, NamedVertex("v"))))
Expand Down Expand Up @@ -90,7 +101,7 @@ class PatternSuite extends SparkFunSuite {
assert(
Pattern.rewriteIncomingEdges("(u)<-[]-(v);(u)-[e]->(v)") === "(v)-[]->(u);(u)-[e]->(v)")
assert(Pattern.rewriteIncomingEdges("(u)<-[]->(v)") === "(u)-[]->(v);(v)-[]->(u)")
assert(Pattern.rewriteIncomingEdges("(u)<-[e]->(v)") === "(u)-[e]->(v);(v)-[e]->(u)")
assert(Pattern.rewriteIncomingEdges("(u)<-[e]->(v)") === "(u)-[e1]->(v);(v)-[e2]->(u)")
assert(Pattern.rewriteIncomingEdges("(u)<-[*5]-(v)") === "(v)-[*5]->(u)")
assert(Pattern.rewriteIncomingEdges("(u)<-[*5]->(v)") === "(u)-[*5]->(v);(v)-[*5]->(u)")
assert(
Expand Down
26 changes: 14 additions & 12 deletions docs/src/04-user-guide/04-motif-finding.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,25 @@

Motif finding refers to searching for structural patterns in a graph. For an example of real-world use, check out the [Motif Finding Tutorial](/03-tutorials/02-motif-tutorial.md).

GraphFrame motif finding uses a simple Domain-Specific Language (DSL) for expressing structural queries. For example, `graph.find("(a)-[e]->(b); (b)-[e2]->(a)")` will search for pairs of vertices `a,b` connected by edges in both directions. It will return a `DataFrame` of all such structures in the graph, with columns for each of the named elements (vertices or edges) in the motif. In this case, the returned columns will be "a, b, e, e2."
GraphFrame motif finding uses a simple Domain-Specific Language (DSL) for expressing structural queries. For example, `graph.find("(a)-[e1]->(b); (b)-[e2]->(a)")` or `graph.find("(a)<-[e]->(b)")` will search for pairs of vertices `a`,`b` connected by edges in both directions. It will return a `DataFrame` of all such structures in the graph, with columns for each of the named elements (vertices or edges) in the motif. In this case, the returned columns will be "a, b, e1, e2."

DSL for expressing structural patterns:

* The basic unit of a pattern is an edge.
For example, `"(a)-[e]->(b)"` expresses an edge `e` from vertex `a` to vertex `b`.
Note that vertices are denoted by parentheses `(a)`, while edges are denoted by
square brackets `[e]`.
* A pattern is expressed as a union of edges. Edge patterns can be joined with semicolons.
Motif `"(a)-[e]->(b); (b)-[e2]->(c)"` specifies two edges from `a` to `b` to `c`.
* A pattern is expressed as a join of edges. Edge patterns can be joined with semicolons.
Motif `"(a)-[e1]->(b); (b)-[e2]->(c)"` specifies two edges from `a` to `b` to `c`.
* Simply, you can also quantify the fixed length like `"(a)-[e*2]->(c)"`. The motif parser decompose it into multiple patterns `"(a)-[e1]->(_v1);(_v1)-[e1]->(c)"` by inserting interim vertexes arbitrarily. It specifies two edges from `a` to `_v1` to `c`.
* In order to search for variable-length motifs, you can specify the range `"(a)-[e*1..3]->(c)"`. It unions the results from each possible length `"(a)-[e*1]->(c)"`, `"(a)-[e*2]->(c)"`, and `"(a)-[e*3]->(c)"` into a DataFrame.
* If the direction is omitted `"(a)-[e]-(b)"`, it represents an undirected pattern — that is, either `"(a)-[e]->(b)"` or `"(a)<-[e]-(b)"`, which includes edges that are incoming or outgoing.
* Within a pattern, names can be assigned to vertices and edges. For example,
`"(a)-[e]->(b)"` has three named elements: vertices `a,b` and edge `e`.
These names serve two purposes:
* The names can identify common elements among edges. For example,
`"(a)-[e]->(b); (b)-[e2]->(c)"` specifies that the same vertex `b` is the destination
of edge `e` and source of edge `e2`.
`"(a)-[e1]->(b); (b)-[e2]->(c)"` specifies that the same vertex `b` is the destination
of edge `e1` and source of edge `e2`.
* The names are used as column names in the result `DataFrame`. If a motif contains
named vertex `a`, then the result `DataFrame` will contain a column "a" which is a
`StructType` with sub-fields equivalent to the schema (columns) of
Expand All @@ -26,7 +29,7 @@ DSL for expressing structural patterns:
`GraphFrame.edges`.
* Be aware that names do *not* identify *distinct* elements: two elements with different
names may refer to the same graph element. For example, in the motif
`"(a)-[e]->(b); (b)-[e2]->(c)"`, the names `a` and `c` could refer to the same vertex.
`"(a)-[e1]->(b); (b)-[e2]->(c)"`, the names `a` and `c` could refer to the same vertex.
To restrict named elements to be distinct vertices or edges, use post-hoc filters
such as `resultDataframe.filter("a.id != c.id")`.
* It is acceptable to omit names for vertices or edges in motifs when not needed.
Expand All @@ -42,6 +45,9 @@ Restrictions:

* Motifs are not allowed to contain edges without any named elements: `"()-[]->()"` and `"!()-[]->()"` are prohibited terms.
* Motifs are not allowed to contain named edges within negated terms (since these named edges would never appear within results). E.g., `"!(a)-[ab]->(b)"` is invalid, but `"!(a)-[]->(b)"` is valid.
* Negation is not supported for the variable-length pattern and undirected pattern: `"!(a)-[*1..3]->(b)"` and `"!(a)-[]-(b)"` are not allowed.
* Unbounded length patten is not supported: `"(a)-[*..3]->(b)"` and `"(a)-[*1..]->(b)"` are not allowed.
* You cannot join additional edges with the variable length pattern: `"(a)-[*1..3]-(b);(b)-[]-(c)"`is not valid.

More complex queries, such as queries which operate on vertex or edge attributes,
can be expressed by applying filters to the result `DataFrame`.
Expand All @@ -59,7 +65,7 @@ from graphframes.examples import Graphs
g = Graphs(spark).friends() # Get example graph

# Search for pairs of vertices with edges in both directions between them
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs = g.find("(a)-[e1]->(b); (b)-[e2]->(a)")
motifs.show()

# More complex queries can be expressed by applying filters
Expand All @@ -77,7 +83,7 @@ import org.graphframes.{examples,GraphFrame}
val g: GraphFrame = examples.Graphs.friends // get example graph

// Search for pairs of vertices with edges in both directions between them.
val motifs: DataFrame = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
val motifs: DataFrame = g.find("(a)-[e1]->(b); (b)-[e2]->(a)")
motifs.show()

// More complex queries can be expressed by applying filters.
Expand Down Expand Up @@ -154,7 +160,3 @@ val condition = { Seq("ab", "bc", "cd")
val chainWith2Friends2 = chain4.where(condition >= 2)
chainWith2Friends2.show()
```

### Conclusion

The above example demonstrated a stateful motif for a fixed-length chain. Currently, in order to search for variable-length motifs, users need to run one query for each possible length. However, the above query patterns allow users to re-use the same code for each length, with the only change being to update the sequence of motif elements ("ab", "bc", "cd" above).
0