Python: working out whether it’s an expression or a program

Here’s a code fragment I wrote for my project Muesli. It tries to work out whether a piece of Python code should be parsed as an expression or as a program, which is necessary because the Python parser API has to be told in advance which of the two it is being given.

int
muesli_python_input_type(const char *python_c_string)
{
  /*
    Suggested heuristic:
    Skip initial docstrings and comments.
    Scan the string, counting depth of parentheses as I go.
    If the first "(" is before the first letter found in the string,
    it's Py_eval_input.

    If there's a digit before the first letter found, it's Py_eval_input.

    If the first run of [a-z0-9_] is followed by (spaces and) an equals
    sign, it's Py_file_input.

    If the first colon is followed by a newline, it's Py_file_input.

    If the first newline is followed by a space, and the parenthesis depth
    there is 0, it's Py_file_input.

    Otherwise, strcmp the text starting at the first letter for each
    Python keyword that can begin a statement, in order of likelihood of
    being at the start (probably "from", "import", "def" coming first);
    if any found, it's Py_file_input.

    Otherwise, it's Py_eval_input.
  */

  const char *p = python_c_string;
  char c;
  int depth = 0;

  while ((c = *p++) != '') {
    switch (c) {
    case '#':
      while ((c = *p++) != '\n') {
	if (c == '') {
	  return Py_eval_input;
	}
      }
      break;
    case '(':
      depth++;			/* fall through */
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      return Py_eval_input;
    case ')':
      depth--;
      break;
    case ' ':
      continue;
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
    case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
    case 'v': case 'w': case 'x': case 'y': case 'z': case '_':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
    case 'V': case 'W': case 'X': case 'Y': case 'Z':
      {
	const char *word_start = p - 1;
	char *word_end;
	int word_length;
	while ((c = *p++) != '') {
	  if ((!isalnum(c)) && (c != '_')) {
	    break;
	  }
	}
	word_end = p;
	word_length = word_end - word_start;
	while ((c == ' ') || (c == '\t')) {
	  if ((c = *p++) == '') {
	    /* probably just a variable name */
	    return Py_eval_input;
	  }
	}
	if (c == '=') {
	  return Py_file_input;
	}
	if ((strncmp(word_start, "from", 4) == 0)
	    || (strncmp(word_start, "import", 6) == 0)
	    || (strncmp(word_start, "def", 3) == 0)) {
	  return Py_file_input;
	}
      }
      break;
    case ':':
      if (*p == '\n') {
	return Py_file_input;
      }
      break;
    case '\n':
      if ((c = *p++) == '') {
	return Py_file_input;	/* dunno really */
      }
      if ((depth == 0)
	  && ((c == ' ')
	      || (c == '\t'))) {
	return Py_file_input;
      }
      break;
    case '"':
      if (*p++ == '"') {
	if (*p++ == '"') {
	  return Py_file_input;
	}
      }
      break;
    }
  }

  return Py_eval_input;
}

Advertisements

Post a Comment

Required fields are marked *

*
*

%d bloggers like this: