1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# coding:utf-8

import ply.lex as lex
from datetime import date

# Short aliases for field names: a query may say g="XXX" where it would
# otherwise have to spell out groups__name="XXX".
shortcuts = dict(
    g='groups__name',
    s='state__name',
)

# =============================
#           LEXER
# =============================

# Reserved words, grouped by arity. t_FIELD re-tags an identifier whose
# text appears in one of these tuples.
u_ops = ('NOT',)
b_ops = ('AND', 'OR')

# Token names used by the lexer rules and the grammar below.
tokens = (
    # values
    'NUMBER', 'DATE', 'STRING',
    # field names and operators
    'FIELD', 'U_OP', 'B_OP', 'COMPA',
)

# The grammar refers to the grouping parentheses as '(' and ')'.
literals = '()'

# Comparison operators: =  <  <=  >  >=  ~  ~~
t_COMPA = r'=|[<>]=?|~~?'

def t_STRING(t):
    r'"[^"]*"'
    # The rule above matches a double-quoted run with no quote inside it;
    # keep only the text between the two delimiters.
    quoted = t.value
    t.value = quoted[1:-1]
    return t

# Dates are written day-first: dd/mm/yyyy. (The earlier note called this
# "ch_FR"; presumably the fr_CH locale convention was meant.)
def t_DATE(t):
    r'(?P<day>\d{1,2})/(?P<month>\d{1,2})/(?P<year>\d{4})'
    # The named groups of the rule above are read back from the match
    # object the lexer exposes for the current token.
    match = t.lexer.lexmatch
    day = int(match.group('day'))
    month = int(match.group('month'))
    year = int(match.group('year'))
    try:
        t.value = date(year, month, day)
    except ValueError:
        # e.g. 31/02/2011 or 1/13/2011: shaped like a date but not a real
        # calendar day. Report it as a CompileException, like the other
        # lexing/parsing errors in this module, so that callers catching
        # CompileException do not get a stray ValueError instead.
        raise CompileException(u"Invalid date: %s" % t.value)
    return t

def t_NUMBER(t):
    r'\d+'
    # Swap the matched digit string for its integer value.
    digits = t.value
    t.value = int(digits)
    return t

def t_FIELD(t):
    r'[A-Za-z_][\w_]*'
    # Field names and keywords share this rule. A word listed in u_ops or
    # b_ops is re-tagged as the matching operator token; anything else
    # keeps the type it arrived with.
    word = t.value
    for kind, reserved in (('U_OP', u_ops), ('B_OP', b_ops)):
        if word in reserved:
            t.type = kind
            break
    return t

def t_error(t):
    # Lexer error hook: abort compilation, naming the first character of
    # t.value as the one that could not be understood.
    offending = t.value[0]
    raise CompileException(u"Cannot make sense of char: %s" % offending)


# Characters skipped between tokens: space and tab.
t_ignore = ' \t'

# =============================
#           PARSER
# =============================

import ply.yacc as yacc
from django.db.models import Q

# Maps each comparison operator of the query language to the lookup suffix
# appended to the field name (as field__suffix) before it is handed to Q.
# Not covered yet: the i* (case-insensitive) variants, in, startswith,
# endswith, day & co, isnull.
# TODO: range
compa2lookup = {
    # equality: empty suffix, the bare field name is used
    '=': '',
    # ordering
    '<': 'lt',
    '<=': 'lte',
    '>': 'gt',
    '>=': 'gte',
    # text matching
    '~': 'contains',
    '~~': 'regex',
}

def p_expression_b_op(p):
    '''expression : expression B_OP expression'''
    # Combine the two operand results with | or & according to the
    # keyword; any other keyword leaves p[0] untouched.
    lhs, keyword, rhs = p[1], p[2], p[3]
    if keyword == 'OR':
        p[0] = lhs | rhs
    elif keyword == 'AND':
        p[0] = lhs & rhs

def p_expression_u_op(p):
    '''expression : U_OP expression'''
    # Only NOT is handled: the result is the operand inverted with ~.
    if p[1] != 'NOT':
        return
    operand = p[2]
    p[0] = ~operand

def p_expression_paren(p):
    "expression : '(' expression ')' "
    # Parentheses only group: hand the inner result through unchanged.
    inner = p[2]
    p[0] = inner

def p_expression_ID(p):
    'expression : FIELD COMPA value'
    # Build the Q object for a single "field <operator> value" comparison.

    # '=' maps to an empty suffix, in which case the bare field name is used.
    lookup = compa2lookup[p[2]]

    # Expand a shortcut such as g -> groups__name; any other field name is
    # used as written. A lookup with a default is used rather than a bare
    # "except:", which would also hide unrelated errors.
    field = shortcuts.get(p[1], p[1])

    if lookup:
        field = '%s__%s' % (field, lookup)

    # In some situations (which ones?), python refuses unicode strings as
    # dict keys for Q(**d), so force the key to str.
    field = str(field)

    p[0] = Q(**{field: p[3]})


def p_value(p):
    '''value : STRING
            | NUMBER
            | DATE'''
    # All three alternatives behave alike: the token's value is passed up
    # as the value of the rule.
    literal = p[1]
    p[0] = literal

def p_error(p):
    # Parser error hook. A falsy p is reported as the expression ending too
    # early; otherwise the message quotes the value of the token p.
    if not p:
        raise CompileException(u"Parsing error: unexpected end of expression")
    raise CompileException(u"Parsing error around token: %s" % p.value)

# Operator precedence table, lowest first: B_OP is left-associative, U_OP
# is right-associative and binds tighter. AND and OR are both B_OP tokens,
# so they share a single precedence level.
precedence = (('left', 'B_OP'), ('right', 'U_OP'))


class CompileException(Exception):
    """Raised when a query expression cannot be lexed or parsed.

    The human-readable reason is available as the ``message`` attribute.
    """

    def __init__(self, message):
        # Hand the text to Exception as well. Without this call, Python 2
        # leaves exc.args empty, so str(exc) and tracebacks show no reason.
        super(CompileException, self).__init__(message)
        self.message = message

def compile(expr):
    """Parse the query expression *expr* and return the parser's result.

    With the grammar actions defined in this module, the result is built
    out of django Q objects. Lexing and parsing problems are reported as
    CompileException, raised by t_error and p_error.
    """
    # NOTE(review): inside this module the name shadows the builtin compile().
    # create separate lexer and parser for each compilation
    # to be thread-safe
    lexer = lex.lex()
    parser = yacc.yacc()
    # now, parse!
    return parser.parse(expr,lexer=lexer)

if __name__ == '__main__':
    # Small demo: compile one sample query and print either the result or
    # the reason it was rejected.
    #
    # Equality is written with a single '='; the lexer has no '==' token,
    # so a doubled sign would only ever demonstrate a parse error.
    sample = '(modified > 1/4/2011 OR NOT s="OK") AND g="XXX"'

    # print(...) with one argument and "except ... as" behave the same on
    # Python 2.6+ and Python 3.
    try:
        print(compile(sample))
    except CompileException as e:
        print(e.message)