文档章节

Spark DecisonTree DebugString Parser

Acce1erator
 Acce1erator
发布于 2017/03/10 16:10
字数 955
阅读 13
收藏 0
public final class DebugStringParser {

    private int lookahead = 0;
    private final int len;
    private final String source;

    public DebugStringParser(String s) {
        if(s == null || s.isEmpty())
            throw new IllegalArgumentException("empty string!");
        this.source = s;
        this.len = s.length();
    }

    /**
     * Grammar:
     <p>S -> ' '*</p>
     <p>IF -> If (feature INT <= DOUBLE)</p>
     <p>ELSE -> Else (feature INT > DOUBLE)</p>
     <p>PREDICT -> Predict: DOUBLE</p>
     <p>INT -> [+-]([1-9][0-9]+|0)</p>
     <p>DOUBLE -> INT(\.[0-9]+)?([eE]INT)?</p>

     <p>TREE -> IF\nTREE\nELSE\nTREE | PREDICT </p>
     * @return
     */
    public Node parseAndGetRootNode(){
        lookahead = 0;
        return mathTree();
    }

    private Node mathTree(){
        matchSpaces();
        if(lookahead <len && source.charAt(lookahead) == 'P'){
            double value =  matchPredict();
            return new Node(null,null,true,-1,value);
        }
        ConditionEntry c1 = matchCondition(true);
        matchLineBreaker();
        matchSpaces();
        Node left = mathTree();
        matchLineBreaker();
        matchSpaces();
        ConditionEntry c2 = matchCondition(false);
        if(c1.index != c2.index || c1.value != c2.value)
            throw new IllegalArgumentException("condition not match!");
        matchLineBreaker();
        Node right = mathTree();
        return new Node(left,right,false,c1.index,c1.value);
    }

    private void matchSpaces(){
        while(lookahead < len && source.charAt(lookahead) == ' ')
            lookahead ++;
    }

    private void matchLineBreaker(){
        if(lookahead >= len || source.charAt(lookahead++) != '\n')
            throw new IllegalArgumentException("line breaker is required.");
    }

    private ConditionEntry matchCondition(boolean isIfBranch){
        if(isIfBranch)
            matchString("If (feature ");
        else
            matchString("Else (feature ");
        int mark = lookahead;
        matchInt();
        int index = Integer.parseInt(source.substring(mark,lookahead));
        if(isIfBranch)
            matchString(" <= ");
        else
            matchString(" > ");
        mark = lookahead;
        matchDouble();
        double value = Double.parseDouble(source.substring(mark,lookahead));
        if(lookahead >= len || source.charAt(lookahead++)!=')')
            throw new IllegalArgumentException("')' is required.");
        return new ConditionEntry(index,value);
    }

    private static final class ConditionEntry{
        final int index;
        final double value;

        ConditionEntry(int index, double value) {
            this.index = index;
            this.value = value;
        }
    }

    private double matchPredict(){
        matchString("Predict: ");
        int mark = lookahead;
        matchDouble();
        return Double.parseDouble(source.substring(mark,lookahead));
    }

    private void matchInt(){
        char c;
        if(lookahead < len && ((c=source.charAt(lookahead)) == '+' || c == '-'))
            lookahead ++;
        if(lookahead <len && source.charAt(lookahead) == '0'){
            lookahead ++;
            return;
        }
        if(lookahead >= len || ((c=source.charAt(lookahead ++)) < '1') || c > '9')
            throw new IllegalArgumentException("[1-9] is expected.");
        while(lookahead < len && (c=source.charAt(lookahead)) >= '0' && c <='9')
            lookahead++;
    }

    private void matchDouble(){
        char c;
        matchInt();
        if(lookahead < len && source.charAt(lookahead) == '.'){
            lookahead ++;
            while(lookahead < len && (c=source.charAt(lookahead)) >= '0' && c <='9')
                lookahead++;
        }
        if(lookahead <len && ((c = source.charAt(lookahead)) == 'e' || c == 'E')){
            lookahead++;
            matchInt();
        }
    }

    private void matchString(String s){
        for(int i=0,j=s.length();i<j;i++){
            if(lookahead >= len || s.charAt(i) != source.charAt(lookahead++))
                throw new IllegalArgumentException("expect '" + s + "' at " + (lookahead-i));
        }
    }

    public static void main(String[] args) {
        String s = "If (feature 9 <= 0.0125)\n" +
                "     If (feature 10 <= 0.0114)\n" +
                "      If (feature 12 <= 0.0075)\n" +
                "       If (feature 0 <= 0.0065)\n" +
                "        If (feature 1 <= 0.0058)\n" +
                "         Predict: 0.047923389851888445\n" +
                "        Else (feature 1 > 0.0058)\n" +
                "         Predict: 0.07137635126022983\n" +
                "       Else (feature 0 > 0.0065)\n" +
                "        If (feature 12 <= 0.0055)\n" +
                "         Predict: 0.08800853325349002\n" +
                "        Else (feature 12 > 0.0055)\n" +
                "         Predict: 0.11735270545200469\n" +
                "      Else (feature 12 > 0.0075)\n" +
                "       If (feature 0 <= 0.0093)\n" +
                "        If (feature 7 <= 0.0101)\n" +
                "         Predict: 0.10974269542143679\n" +
                "        Else (feature 7 > 0.0101)\n" +
                "         Predict: 0.14264542094310068\n" +
                "       Else (feature 0 > 0.0093)\n" +
                "        If (feature 6 <= 0.0132)\n" +
                "         Predict: 0.15816845060656223\n" +
                "        Else (feature 6 > 0.0132)\n" +
                "         Predict: 0.22484364604125084\n" +
                "     Else (feature 10 > 0.0114)\n" +
                "      If (feature 0 <= 0.0149)\n" +
                "       If (feature 7 <= 0.011)\n" +
                "        If (feature 2 <= 0.0199)\n" +
                "         Predict: 0.17659115093907074\n" +
                "        Else (feature 2 > 0.0199)\n" +
                "         Predict: 0.11897248764689246\n" +
                "       Else (feature 7 > 0.011)\n" +
                "        If (feature 10 <= 0.0137)\n" +
                "         Predict: 0.19971164036377678\n" +
                "        Else (feature 10 > 0.0137)\n" +
                "         Predict: 0.23499119198104446\n" +
                "      Else (feature 0 > 0.0149)\n" +
                "       If (feature 2 <= 0.0355)\n" +
                "        If (feature 10 <= 0.0169)\n" +
                "         Predict: 0.19316578816705413\n" +
                "        Else (feature 10 > 0.0169)\n" +
                "         Predict: 0.27050388273012166\n" +
                "       Else (feature 2 > 0.0355)\n" +
                "        If (feature 1 <= 0.0164)\n" +
                "         Predict: 0.10299145299145299\n" +
                "        Else (feature 1 > 0.0164)\n" +
                "         Predict: 0.14485303437882907\n" +
                "    Else (feature 9 > 0.0125)\n" +
                "     If (feature 12 <= 0.0222)\n" +
                "      If (feature 12 <= 0.0025)\n" +
                "       If (feature 3 <= 0.0136)\n" +
                "        If (feature 4 <= 0.0163)\n" +
                "         Predict: 0.16205533596837945\n" +
                "        Else (feature 4 > 0.0163)\n" +
                "         Predict: 0.07920792079207921\n" +
                "       Else (feature 3 > 0.0136)\n" +
                "        If (feature 9 <= 0.2019)\n" +
                "         Predict: 0.9225040850767459\n" +
                "        Else (feature 9 > 0.2019)\n" +
                "         Predict: 0.5019334880123744\n" +
                "      Else (feature 12 > 0.0025)\n" +
                "       If (feature 3 <= 0.0759)\n" +
                "        If (feature 7 <= 0.0217)\n" +
                "         Predict: 0.20286529220528218\n" +
                "        Else (feature 7 > 0.0217)\n" +
                "         Predict: 0.7116316639741519\n" +
                "       Else (feature 3 > 0.0759)\n" +
                "        If (feature 12 <= 0.0082)\n" +
                "         Predict: 0.1456244234832029\n" +
                "        Else (feature 12 > 0.0082)\n" +
                "         Predict: 0.6139024177696873\n" +
                "     Else (feature 12 > 0.0222)\n" +
                "      If (feature 3 <= 0.0136)\n" +
                "       If (feature 0 <= 0.0149)\n" +
                "        If (feature 14 <= 0.0089)\n" +
                "         Predict: 0.11330472103004292\n" +
                "        Else (feature 14 > 0.0089)\n" +
                "         Predict: 0.16452830188679246\n" +
                "       Else (feature 0 > 0.0149)\n" +
                "        If (feature 11 <= 0.0167)\n" +
                "         Predict: 0.17938517179023508\n" +
                "        Else (feature 11 > 0.0167)\n" +
                "         Predict: 0.27445605619325\n" +
                "      Else (feature 3 > 0.0136)\n" +
                "       If (feature 2 <= 0.0355)\n" +
                "        If (feature 4 <= 0.0186)\n" +
                "         Predict: 0.7787088347055098\n" +
                "        Else (feature 4 > 0.0186)\n" +
                "         Predict: 0.9376800209478922\n" +
                "       Else (feature 2 > 0.0355)\n" +
                "        If (feature 3 <= 0.0759)\n" +
                "         Predict: 0.9172398148052672\n" +
                "        Else (feature 3 > 0.0759)\n" +
                "         Predict: 0.985060246603449";
        DebugStringParser parser = new DebugStringParser(s);
        System.out.println(parser.parseAndGetRootNode());
    }

}


public final class Node implements Serializable {

    private static final long serialVersionUID = 273479971015393598L;

    private final Node left;
    private final Node right;

    private final boolean leaf;
    private final int featureIndex;
    private final double featureValue;

    Node(Node left, Node right, boolean leaf, int featureIndex, double featureValue) {
        if((!leaf) ? (left == null && right ==null):(left != null && right != null))
            throw new IllegalArgumentException("illegal leaf:"+ leaf);
        this.left = left;
        this.right = right;
        this.leaf = leaf;
        this.featureIndex = featureIndex;
        this.featureValue = featureValue;
    }

    public Node getLeft() {
        return left;
    }

    public Node getRight() {
        return right;
    }

    public boolean isLeaf() {
        return leaf;
    }

    public int getFeatureIndex() {
        return featureIndex;
    }

    public double getFeatureValue() {
        return featureValue;
    }

    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder();
        toString(builder,"");
        return builder.toString();
    }

    private void toString(StringBuilder builder,String prefix){
        if(isLeaf()){
            builder.append(prefix)
                    .append("Predict: ")
                    .append(featureValue);
            return;
        }
        builder.append(prefix)
                .append("If (feature ")
                .append(featureIndex)
                .append(" <= ")
                .append(featureValue)
                .append(")\n");
        String tab = "  " + prefix;
        left.toString(builder, tab);
        builder
                .append('\n')
                .append(prefix)
                .append("Else (feature ")
                .append(featureIndex)
                .append(" > ")
                .append(featureValue)
                .append(")\n");
        right.toString(builder, tab);
    }
}

 

© 著作权归作者所有

Acce1erator
粉丝 23
博文 25
码字总数 18001
作品 0
朝阳
程序员
私信 提问
扩展Spark Catalyst,打造自定义的Spark SQL引擎 - 知乎

扩展Spark Catalyst,打造自定义的Spark SQL引擎 Apache Spark是大数据处理领域最常用的计算引擎之一,被应用在各种各样的场景中,除了易用的API,稳定高效的处理引擎,可扩展性也是Spark能够...

大数据技术与实践
前天
0
0
扩展Spark Catalyst,打造自定义的Spark SQL引擎

Apache Spark是大数据处理领域最常用的计算引擎之一,被应用在各种各样的场景中,除了易用的API,稳定高效的处理引擎,可扩展性也是Spark能够得到广泛应用的一个重要原因。Spark中最常见的扩...

李呈祥
2018/11/20
0
0
一条 SQL 在 Apache Spark 之旅(上)

Spark SQL 是 Spark 众多组件中技术最复杂的组件之一,它同时支持 SQL 查询和 DataFrame DSL。通过引入了 SQL 的支持,大大降低了开发人员的学习和使用成本。目前,整个 SQL 、Spark ML、Spa...

Spark
06/12
0
0
Apache Spark 1.6.2 发布,集群计算环境

Apache Spark 1.6.2 发布了,Apache Spark 是一种与 Hadoop 相似的开源集群计算环境,但是两者之间还存在一些不同之处,这些有用的不同之处使 Spark 在某些工作负载方面表现得更加优越,换句...

愚_者
2016/06/28
3.9K
1
Apache Spark 2.4.0 正式发布

Apache Spark 2.4 与昨天正式发布,Apache Spark 2.4 版本是 2.x 系列的第五个版本。 如果想及时了解 Spark、Hadoop或者Hbase相关的文章,欢迎关注微信公共帐号: itebloghadoop Apache Spa...

Spark
2018/11/09
0
0

没有更多内容

加载失败,请刷新页面

加载更多

3_数组

3_数组

行者终成事
今天
7
0
经典系统设计面试题解析:如何设计TinyURL(二)

原文链接:https://www.educative.io/courses/grokking-the-system-design-interview/m2ygV4E81AR 编者注:本文以一道经典的系统设计面试题:《如何设计TinyURL》的参考答案和解析为例,帮助...

APEMESH
今天
7
0
使用logstash同步MySQL数据到ES

概述   在生成业务常有将MySQL数据同步到ES的需求,如果需要很高的定制化,往往需要开发同步程序用于处理数据。但没有特殊业务需求,官方提供的logstash就很有优势了。   在使用logstas...

zxiaofan666
今天
10
0
X-MSG-IM-分布式信令跟踪能力

经过一周多的鏖战, X-MSG-IM的分布式信令跟踪能力已基本具备, 特点是: 实时. 只有要RX/TX就会实时产生信令跟踪事件, 先入kafka, 再入influxdb待查. 同时提供实时sub/pub接口. 完备. 可以完整...

dev5
今天
7
0
OpenJDK之CyclicBarrier

OpenJDK8,本人看的是openJDK。以前就看过,只是经常忘记,所以记录下 图1 CyclicBarrier是Doug Lea在JDK1.5中引入的,作用就不详细描述了,主要有如下俩个方法使用: await()方法,如果当前线...

克虏伯
今天
8
0

没有更多内容

加载失败,请刷新页面

加载更多

返回顶部
顶部