Commit 772331f

yinzhigang authored and siddontang committed
add support of elasticsearch pipeline (#224)
1 parent 4dae2e1 commit 772331f


4 files changed: +34 -3 lines


README.md (+15)

@@ -174,6 +174,21 @@ You can ignore these tables in the configuration like:
 skip_no_pk_table = true
 ```
 
+## Elasticsearch Pipeline
+You can use an [Ingest Node Pipeline](https://www.elastic.co/guide/en/elasticsearch/reference/current/ingest.html) to pre-process documents before indexing, like JSON string decoding, field merging, and more.
+
+```
+[[rule]]
+schema = "test"
+table = "t1"
+index = "t"
+type = "_doc"
+
+# pipeline id
+pipeline = "my-pipeline-id"
+```
+Note: you should [create the pipeline](https://www.elastic.co/guide/en/elasticsearch/reference/current/put-pipeline-api.html) manually, and Elasticsearch >= 5.0 is required.
+
 ## Why not other rivers?
 
 Although there are some other MySQL rivers for Elasticsearch, like [elasticsearch-river-jdbc](https://github.com/jprante/elasticsearch-river-jdbc), [elasticsearch-river-mysql](https://github.com/scharron/elasticsearch-river-mysql), I still want to build a new one with Go, why?
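As the added README note says, the pipeline must exist before the river references it. A minimal sketch of creating `my-pipeline-id` through the put-pipeline API; the localhost address and the `set` processor body are illustrative assumptions, not part of this commit:

```go
// Sketch: create the ingest pipeline the README rule references.
// Assumes Elasticsearch >= 5.0 on localhost:9200; the processor is
// only an example of pre-processing, not part of this commit.
package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	// A trivial pipeline: tag every document that passes through it.
	body := []byte(`{
		"description": "tag rows synced from MySQL",
		"processors": [
			{"set": {"field": "synced_by", "value": "go-mysql-elasticsearch"}}
		]
	}`)

	req, err := http.NewRequest(http.MethodPut,
		"http://localhost:9200/_ingest/pipeline/my-pipeline-id",
		bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := ioutil.ReadAll(resp.Body)
	fmt.Println(resp.Status, string(out)) // expect 200 OK, {"acknowledged":true}
}
```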

elastic/client.go (+4)

@@ -79,6 +79,7 @@ type BulkRequest struct {
 	Type string
 	ID string
 	Parent string
+	Pipeline string
 
 	Data map[string]interface{}
 }
@@ -99,6 +100,9 @@ func (r *BulkRequest) bulk(buf *bytes.Buffer) error {
 	if len(r.Parent) > 0 {
 		metaData["_parent"] = r.Parent
 	}
+	if len(r.Pipeline) > 0 {
+		metaData["pipeline"] = r.Pipeline
+	}
 
 	meta[r.Action] = metaData
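With the new `Pipeline` field and the `metaData["pipeline"]` entry above, each bulk action's metadata line now carries the pipeline id. A standalone sketch of the metadata line that `bulk` ends up writing for an index action; the index, type, and id values are taken from the README example, and the key order comes from `json.Marshal` sorting map keys:

```go
// Sketch: the bulk metadata line produced once Pipeline is set on a
// request; values mirror the README example rule, not real sync data.
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	metaData := map[string]interface{}{
		"_index": "t",
		"_type":  "_doc",
		"_id":    "1",
	}

	// Mirrors the new branch in bulk(): only attach the pipeline when set.
	pipeline := "my-pipeline-id"
	if len(pipeline) > 0 {
		metaData["pipeline"] = pipeline
	}

	meta := map[string]interface{}{"index": metaData}
	line, _ := json.Marshal(meta)
	fmt.Println(string(line))
	// {"index":{"_id":"1","_index":"t","_type":"_doc","pipeline":"my-pipeline-id"}}
}
```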

river/rule.go (+4)

@@ -27,6 +27,10 @@ type Rule struct {
 
 	// only MySQL fields in filter will be synced, default sync all fields
 	Filter []string `toml:"filter"`
+
+	// Elasticsearch pipeline
+	// To pre-process documents before indexing
+	Pipeline string `toml:"pipeline"`
 }
 
 func newDefaultRule(schema string, table string) *Rule {
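The new `toml:"pipeline"` tag is what ties the README's `pipeline = "my-pipeline-id"` key to the rule. A trimmed-down sketch of that decoding, assuming the BurntSushi/toml package the river uses for its config file and keeping only the fields relevant here:

```go
// Sketch: decode the README's rule snippet into a cut-down Rule struct.
// The struct below is illustrative, not the river's full Rule type.
package main

import (
	"fmt"

	"github.com/BurntSushi/toml"
)

type Rule struct {
	Schema   string `toml:"schema"`
	Table    string `toml:"table"`
	Index    string `toml:"index"`
	Type     string `toml:"type"`
	Pipeline string `toml:"pipeline"`
}

func main() {
	data := `
schema = "test"
table = "t1"
index = "t"
type = "_doc"
pipeline = "my-pipeline-id"
`
	var rule Rule
	if _, err := toml.Decode(data, &rule); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", rule)
	// {Schema:test Table:t1 Index:t Type:_doc Pipeline:my-pipeline-id}
}
```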

river/sync.go (+11 -3)

@@ -183,7 +183,7 @@ func (r *River) makeRequest(rule *Rule, action string, rows [][]interface{}) ([]
 		}
 	}
 
-	req := &elastic.BulkRequest{Index: rule.Index, Type: rule.Type, ID: id, Parent: parentID}
+	req := &elastic.BulkRequest{Index: rule.Index, Type: rule.Type, ID: id, Parent: parentID, Pipeline: rule.Pipeline}
 
 	if action == canal.DeleteAction {
 		req.Action = elastic.ActionDelete
@@ -242,13 +242,21 @@ func (r *River) makeUpdateRequest(rule *Rule, rows [][]interface{}) ([]*elastic.
 		req.Action = elastic.ActionDelete
 		reqs = append(reqs, req)
 
-		req = &elastic.BulkRequest{Index: rule.Index, Type: rule.Type, ID: afterID, Parent: afterParentID}
+		req = &elastic.BulkRequest{Index: rule.Index, Type: rule.Type, ID: afterID, Parent: afterParentID, Pipeline: rule.Pipeline}
 		r.makeInsertReqData(req, rule, rows[i+1])
 
 		r.st.DeleteNum.Add(1)
 		r.st.InsertNum.Add(1)
 	} else {
-		r.makeUpdateReqData(req, rule, rows[i], rows[i+1])
+		if len(rule.Pipeline) > 0 {
+			// Pipelines can only be specified on index action
+			r.makeInsertReqData(req, rule, rows[i+1])
+			// Make sure action is index, not create
+			req.Action = elastic.ActionIndex
+			req.Pipeline = rule.Pipeline
+		} else {
+			r.makeUpdateReqData(req, rule, rows[i], rows[i+1])
+		}
 		r.st.UpdateNum.Add(1)
 	}
 
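The new branch exists because, as its comment says, a pipeline can only be specified on an index action: bulk `update` actions do not run ingest pipelines. So when a rule has a pipeline, an UPDATE row is re-sent as a full index request instead of a partial update. A sketch of that decision with a simplified request type; the type, field names, and values are illustrative, not the river's own:

```go
// Sketch of the branch above: with a pipeline the row is fully
// re-indexed so the pipeline runs; without one it stays a partial update.
package main

import "fmt"

type bulkRequest struct {
	Action   string
	Pipeline string
	Data     map[string]interface{}
}

// updateRequest mirrors the else-branch in makeUpdateRequest.
func updateRequest(pipeline string, row map[string]interface{}) bulkRequest {
	if len(pipeline) > 0 {
		// Pipelines only run on index actions, so send the whole new row.
		return bulkRequest{Action: "index", Pipeline: pipeline, Data: row}
	}
	// No pipeline: a partial update with just the changed fields is fine.
	return bulkRequest{Action: "update", Data: row}
}

func main() {
	row := map[string]interface{}{"id": 1, "name": "new-name"}

	fmt.Printf("%+v\n", updateRequest("my-pipeline-id", row))
	// {Action:index Pipeline:my-pipeline-id Data:map[id:1 name:new-name]}
	fmt.Printf("%+v\n", updateRequest("", row))
	// {Action:update Pipeline: Data:map[id:1 name:new-name]}
}
```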
