Source code for src.tools.parsing

"""Tools for parsing strings and extracting certain types of information.
"""
import src.core


def findClosingParenthesis(expression, start=0):
    """Find the parenthesis which closes an already-opened group.

    The scan begins with one parenthesis considered open, so nested pairs
    encountered along the way are skipped and only the parenthesis which
    balances the initial opening one is reported.

    Parameters
    ----------
    expression : str
        The expression in which the closing parenthesis is sought.
    start : int
        The position of the first character **after** the opening parenthesis
        in the expression.

    Returns
    -------
    int
        The position of the closing parenthesis, or -1 if no closing
        parenthesis is found (including when `start` lies at or beyond the
        end of the expression).

    Examples
    --------
    >>> findClosingParenthesis("3 + @(cat) + @(dog)", 6)
    9
    >>> findClosingParenthesis("@(cat", 2)
    -1
    """
    # One parenthesis is already open when the scan starts.
    depth = 1

    # The range stops at the last valid index, so an unbalanced expression
    # falls through to the -1 return instead of indexing past the end.
    for position in range(start, len(expression)):
        currChar = expression[position]
        if currChar == '(':
            depth += 1
        elif currChar == ')':
            depth -= 1
            if depth == 0:
                return position
    return -1

def extractNamesOfType(expression, marker):
    """Collect the bin names introduced by a particular marker.

    A bin name is the text found between ``marker + '('`` and the
    parenthesis which closes that group.

    Parameters
    ----------
    expression : str
        The expression to search for bin names of the desired type.
    marker : str
        The character or string which precedes the opening parenthesis of
        a bin name of the desired type.

    Returns
    -------
    list of str
        The names of the bins of the relevant type, in the order in which
        they occur. The search stops at the first group for which no closing
        parenthesis is found.

    Examples
    --------
    >>> extractNamesOfType("3 + @(cat)*$(fish) + @(dog)/#(mouse)", "@")
    ['cat', 'dog']
    """
    opener = marker + '('
    skip = len(opener)
    names = []
    cursor = 0

    while cursor < len(expression):
        cursor = expression.find(opener, cursor)
        if cursor < 0:
            # No further occurrence of the marker.
            break
        nameStart = cursor + skip
        closeAt = findClosingParenthesis(expression, nameStart)
        if closeAt < 0:
            # Unterminated group: keep what has been gathered so far.
            break
        names.append(expression[nameStart:closeAt])
        # Resume the search from the closing parenthesis just found.
        cursor = closeAt
    return names

def extractNames(expression):
    """Gather the names of every kind of data bin used in an expression.

    Parameters
    ----------
    expression : str
        The expression from which to extract the names of data bins.

    Returns
    -------
    tuple of list of str
        Three lists of names, in this order: the constants, the columns and
        the parameters referenced by the expression. Each list is obtained
        with `extractNamesOfType` using the corresponding marker defined in
        `src.core.experiment`.

    Examples
    --------
    >>> extractNames("3 + @(cat) + @(dog)")
    (['cat', 'dog'], [], [])

    >>> extractNames("3 + @(cat)*$(fish) + @(dog)/#(mouse)")
    (['cat', 'dog'], ['mouse'], ['fish'])
    """
    marks = src.core.experiment

    def namesMarkedBy(marker):
        return extractNamesOfType(expression, marker)

    return (namesMarkedBy(marks.MARK_CONSTANT),
            namesMarkedBy(marks.MARK_COLUMN),
            namesMarkedBy(marks.MARK_PARAMETER))


def tokenize(string, delimiter=','):
    """Split a string at a delimiter while respecting grouping constructs.

    This behaves much like the built-in string method `split`, except that
    a delimiter found inside a nested group (parentheses, square brackets,
    curly braces, or single or double quotation marks) does not cause a
    split.

    Parameters
    ----------
    string : str
        The string to split. If it begins with a character which opens a
        group, the character which closes that group ends the tokenized
        list. Otherwise the string is treated as if an opening parenthesis
        had already been consumed, so an unmatched closing bracket ends
        the list.
    delimiter : str
        The single character at which to split.

    Returns
    -------
    tuple
        A 2-tuple ``(tokens, remainder)``. ``tokens`` is the list of
        whitespace-stripped tokens and ``remainder`` is the part of the
        string following the character which closed the outermost group
        (empty if the whole string was consumed).

    Notes
    -----
    The opening character of the outermost group and a closing bracket
    which ends it are not part of any token. A closing quotation mark which
    ends the outermost group, however, is kept at the end of the last
    token.

    Examples
    --------
    >>> tokenize("(a, b) rest")
    (['a', 'b'], ' rest')
    >>> tokenize("a, (b, c), d")
    (['a', '(b, c)', 'd'], '')
    """
    finished = []        # Tokens completed so far
    pending = []         # Characters of the token being assembled

    doubleOpen = False   # Inside double quotation marks
    singleOpen = False   # Inside single quotation marks

    # Combined depth of parentheses, brackets and braces. A string which
    # does not open a group itself starts one level deep.
    if string.startswith(('(', '[', '{', '"', "'")):
        nesting = 0
    else:
        nesting = 1

    consumed = 0         # Number of characters processed
    for consumed, symbol in enumerate(string, 1):
        if symbol == delimiter and nesting + doubleOpen + singleOpen == 1:
            # Delimiter at the outermost level: finish the current token.
            finished.append(''.join(pending).strip())
            pending = []
            continue

        # Closing brackets lower the depth *before* the character is kept,
        # so the bracket which ends the outermost group is dropped.
        if symbol in ')]}':
            nesting -= 1

        level = nesting + doubleOpen + singleOpen
        if level > 1 or (level > 0 and symbol != delimiter):
            pending.append(symbol)

        # Opening characters raise the depth *after* the character is kept,
        # so the character which opens the outermost group is dropped.
        if symbol in '([{':
            nesting += 1
        elif symbol == '"':
            doubleOpen = not doubleOpen
        elif symbol == "'":
            singleOpen = not singleOpen

        if nesting + doubleOpen + singleOpen == 0:
            # The outermost group has been closed.
            break

    finished.append(''.join(pending).strip())
    return (finished, string[consumed:])

def escapeXML(string):
    """Escape the characters which are special in an XML document.

    Parameters
    ----------
    string : str
        The text to escape. Any other object is first converted with `str`.

    Returns
    -------
    str
        The input text with ``&``, ``"``, ``'``, ``>`` and ``<`` replaced by
        the corresponding XML entity references.
    """
    text = str(string)
    # The ampersand must be handled first; otherwise the ampersands
    # introduced by the other entities would be escaped a second time.
    text = text.replace('&', '&amp;')
    text = text.replace('"', '&quot;')
    text = text.replace("'", '&apos;')
    text = text.replace('>', '&gt;')
    text = text.replace('<', '&lt;')
    return text
Navigation

Source code for src.tools.parsing

Quick search

Navigation