|
33 | 33 | import javax.xml.xpath.XPathExpressionException;
|
34 | 34 | import javax.xml.xpath.XPathFactory;
|
35 | 35 |
|
| 36 | +import org.w3c.dom.Document; |
36 | 37 | import org.w3c.dom.Element;
|
37 | 38 | import org.w3c.dom.Node;
|
38 | 39 | import org.w3c.dom.NodeList;
|
39 | 40 | import org.w3c.dom.Text;
|
40 | 41 | import org.xml.sax.SAXException;
|
41 | 42 |
|
| 43 | +import opennlp.tools.sentdetect.segment.LanguageRule; |
| 44 | +import opennlp.tools.sentdetect.segment.Rule; |
42 | 45 | import opennlp.tools.util.InvalidFormatException;
|
43 | 46 | import opennlp.tools.util.XmlUtil;
|
44 | 47 | import opennlp.tools.util.model.ArtifactSerializer;
|
@@ -521,4 +524,79 @@ public boolean getBool(String name, boolean defValue) throws InvalidFormatExcept
|
521 | 524 | */
|
522 | 525 | public abstract AdaptiveFeatureGenerator create() throws InvalidFormatException;
|
523 | 526 | }
|
| 527 | + |
| 528 | + public static Map<String, LanguageRule> getLanguageRules(InputStream xmlDescriptionIn) throws IOException { |
| 529 | + Document xmlDocument = createDOM(xmlDescriptionIn); |
| 530 | + Element element = xmlDocument.getDocumentElement(); |
| 531 | + String tagName = element.getTagName(); |
| 532 | + |
| 533 | + Map<String, LanguageRule> mapping = new HashMap<>(); |
| 534 | + if ("languageRules".equals(tagName)) { |
| 535 | + NodeList nodes = element.getChildNodes(); |
| 536 | + for (int i = 0; i < nodes.getLength(); i++) { |
| 537 | + if (nodes.item(i) instanceof Element) { |
| 538 | + Element childElem = (Element)nodes.item(i); |
| 539 | + if ("languageRule".equals(childElem.getTagName())) { |
| 540 | + getRules(mapping, childElem); |
| 541 | + } |
| 542 | + } |
| 543 | + } |
| 544 | + } |
| 545 | + return mapping; |
| 546 | + } |
| 547 | + |
| 548 | + static void getRules(Map<String, LanguageRule> map, Element element) { |
| 549 | + String name = element.getAttribute("name"); |
| 550 | + if (name != null) { |
| 551 | + LanguageRule languageRule = new LanguageRule(name); |
| 552 | + NodeList nodes = element.getChildNodes(); |
| 553 | + for (int i = 0; i < nodes.getLength(); i++) { |
| 554 | + if (nodes.item(i) instanceof Element) { |
| 555 | + Element childElem = (Element)nodes.item(i); |
| 556 | + if ("rule".equals(childElem.getTagName())) { |
| 557 | + getRule(languageRule, childElem); |
| 558 | + } |
| 559 | + } |
| 560 | + } |
| 561 | + map.put(name, languageRule); |
| 562 | + } |
| 563 | + } |
| 564 | + |
| 565 | + static void getRule(LanguageRule languageRule, Element element) { |
| 566 | + String breaking = element.getAttribute("break"); |
| 567 | + String beforeBreak = ""; |
| 568 | + String afterBreak = ""; |
| 569 | + if (breaking != null) { |
| 570 | + NodeList nodes = element.getChildNodes(); |
| 571 | + for (int i = 0; i < nodes.getLength(); i++) { |
| 572 | + if (nodes.item(i) instanceof Element) { |
| 573 | + Element childElem = (Element)nodes.item(i); |
| 574 | + if ("beforeBreak".equals(childElem.getTagName())) { |
| 575 | + Node firstChild = childElem.getFirstChild(); |
| 576 | + Text text = (Text) firstChild; |
| 577 | + if (text != null) { |
| 578 | + beforeBreak = text.getWholeText(); |
| 579 | + } else { |
| 580 | + beforeBreak = ""; |
| 581 | + } |
| 582 | + } |
| 583 | + if ("afterBreak".equals(childElem.getTagName())) { |
| 584 | + Node firstChild = childElem.getFirstChild(); |
| 585 | + Text text = (Text) firstChild; |
| 586 | + if (text != null) { |
| 587 | + afterBreak = text.getWholeText(); |
| 588 | + } else { |
| 589 | + afterBreak = ""; |
| 590 | + } |
| 591 | + } |
| 592 | + } |
| 593 | + } |
| 594 | + if ("yes".equals(breaking)) { |
| 595 | + languageRule.addRule(new Rule(true, beforeBreak, afterBreak)); |
| 596 | + } |
| 597 | + if ("no".equals(breaking)) { |
| 598 | + languageRule.addRule(new Rule(false, beforeBreak, afterBreak)); |
| 599 | + } |
| 600 | + } |
| 601 | + } |
524 | 602 | }
|
0 commit comments