I'm writing a translator for a university project that should translate given Pascal code into assembler code using flex/bison. I've written the parser and lexer, which generate a symbol table (at the moment it works correctly only without procedures and functions). My question is: how do I generate assembler code from it and print it to a file?

我正在为uni项目编写一个翻译器,它应该使用flex / bison将给定的Pascal代码转换为汇编代码。我编写了解析器和词法分析器,它生成符号表(atm只在没有程序和函数的情况下正常工作)。我的问题是,如何从中生成汇编代码并将其打印到文件中。

Here is my lexer:


#include "parser.tab.h"
#include <string.h>
#define YY_FLEX_DEBUG 1

/* Named patterns shared by the rules that follow. */
letter      [a-zA-Z]
digit       [0-9]
ID          {letter}({letter}|{digit})*
delim       [ \t\n]
/* Integer part, optional fraction, optional exponent. The exponent digits
   must reference the {digit} definition: a bare (digit) would match the
   literal text "digit" instead of a decimal digit. */
NUM         {digit}+(\.{digit}+)?(E[+\-]?{digit}+)?
ws          {delim}+

%%

{ws}        { /* whitespace is skipped: no token is returned */ }
if          {return(IF);                                }
then        {return(THEN);                              }
else        {return(ELSE);                              }
{NUM}       {yylval.stringValue = strdup(yytext); return(NUM); /* heap copy of the literal's text */ }
"<"         {yylval.stringValue = "<"; return(RELOP); /* operator text is a string literal, not a heap copy */ }
"<="        {yylval.stringValue = "<="; return(RELOP);  }
"="         {yylval.stringValue = "="; return(RELOP);   }
">"         {yylval.stringValue = ">"; return(RELOP);   }
">="        {yylval.stringValue = ">="; return(RELOP);  }
"<>"        {yylval.stringValue = "<>"; return(RELOP);  }
":="        {return(ASSIGNOP);                          }
do          {return(DO);                                }
program     {return(PROGRAM);                           }
var         {return(VAR);                               }
array       {return(ARRAY);                             }
of          {return(OF);                                }
integer     {return(INTEGER);                           }
real        {return(REAL);                              }
function    {return(FUNCTION);                          }
procedure   {return(PROCEDURE);                         }
begin       {return(START); /* the 'begin' keyword is reported as START */ }
end         {return(END);                               }
div         {yylval.stringValue = "div"; return(MULOP); }
mod         {yylval.stringValue = "mod"; return(MULOP); }
and         {yylval.stringValue = "and"; return(MULOP); }
"*"         {yylval.stringValue = "*"; return(MULOP);   }
"/"         {yylval.stringValue = "/"; return(MULOP);   }
while       {return(WHILE);                             }
or          {return(OR);                                }
"+"         {yylval.stringValue = "+"; return(SIGN); /* '+' and '-' share the SIGN token; the text tells them apart */ }
"-"         {yylval.stringValue = "-"; return(SIGN);    }
".."        {return(DOUBLEDOT);                         }
","         {return *yytext; /* punctuation is returned as its own character code */ }
"("         {return *yytext;                            }
")"         {return *yytext;                            }
"["         {return *yytext;                    }
"]"         {return *yytext;                    }
";"         {return *yytext;                                }
":"         {return *yytext;                                }
"."         {return *yytext;                                }
not         {return(NOT);                               }
{ID}        {yylval.stringValue= strdup(yytext); return(ID); /* heap copy of the identifier text */}
/* Called by the scanner when it reaches end of input. Returning 1 reports
   that there is no further input to continue with, so scanning stops. */
int yywrap(void)
{
    return 1;
}

Here is my parser:


    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "SymbolTable.h"
    /* Error counter; main() resets it to 0. */
    int errors;
    /* Not referenced elsewhere in the visible code; presumably a counter for
       generated labels (the name looks like a misspelling of "label") -- TODO confirm. */
    int lable;
    /* Enables the parser's debug tracing support so that main() can set yydebug. */
    #define YYDEBUG 1

    /*
     * Add sym_name to the symbol table unless an entry with that name already
     * exists, in which case a diagnostic is printed instead.
     *
     * Returns 1 when a new entry was created, 0 when the name was already
     * present or the entry could not be created.
     */
    int install(char *sym_name)
    {
        if (getsym(sym_name) != 0) {
            printf("%s is defined\n", sym_name);
            return 0;
        }
        return putsym(sym_name) != 0;
    }

    /*
     * Record the numeric literal sym_name in the symbol table the first time
     * it is seen; repeated literals are silently ignored.
     *
     * Returns 1 when a new entry was created, 0 otherwise.
     */
    int install_num(char *sym_name)
    {
        if (getsym(sym_name) != 0) {
            return 0;
        }
        return putnum(sym_name) != 0;
    }

    /*
     * Check that sym_name has an entry in the symbol table and print a
     * diagnostic when it does not.
     *
     * Returns 1 when the name is known, 0 when it is undeclared.
     */
    int context_check(char *sym_name)
    {
        if (getsym(sym_name) == 0) {
            printf("%s is undeclared\n", sym_name);
            return 0;
        }
        return 1;
    }
    /* Semantic value type shared by lexer and parser. The lexer stores token
       text in yylval.stringValue, and the <stringValue> tags on %token refer
       to the member of that name. */
    %union {
        int intValue;
        float floatValue;
        char *stringValue;
        int adress;
    }
/* Grammar entry point. */
%start program

/* Tokens whose text the lexer passes along in yylval.stringValue. */
%token <stringValue> ID
%token <stringValue> NUM
%token <stringValue> RELOP MULOP

/* Keywords and multi-character symbols returned by the lexer. Every token
   name used in the rules must be declared here. */
%token PROGRAM VAR ARRAY OF INTEGER REAL FUNCTION PROCEDURE
%token START END
%token IF THEN ELSE WHILE DO
%token ASSIGNOP DOUBLEDOT
/* The lexer also stores "+" or "-" in yylval.stringValue for SIGN; tag it
   with <stringValue> once an action needs to read that text. */
%token SIGN OR
%token NOT

%left '-' '+'
%left '*' '/'

/* End of the declarations; the grammar rules follow. */
%%
/* Whole program: header, global declarations, subprograms, then the main block. */
program: PROGRAM ID '(' prog_list ')' ';' declarations subprogram_declarations compound_statement '.'
prog_list: ID
         | prog_list ',' ID
/* Every identifier in a declaration list is entered into the symbol table via install(). */
identifier_list: ID  {install($1);}
         | identifier_list ',' ID {install($3);} 
declarations: declarations VAR identifier_list ':' type ';'
         | /* empty */
/* set_type() stamps the given type name on every symbol-table entry whose type is still unset. */
type: standart_type
         | ARRAY '[' NUM DOUBLEDOT NUM ']' OF REAL {set_type("REALARR");}
         | ARRAY '[' NUM DOUBLEDOT NUM ']' OF INTEGER {set_type("INTARR");}
standart_type: INTEGER {set_type("INTEGER");}
         | REAL {set_type("REAL");}
subprogram_declarations: subprogram_declarations subprogram_declaration ';'
         | /* empty */
subprogram_declaration: subprogram_head declarations compound_statement;
/* The subprogram's own name is installed only after its whole header has been matched. */
subprogram_head: FUNCTION ID arguments ':' INTEGER ';' {install($2); set_type("INTEGER");}
         | FUNCTION ID arguments ':' REAL ';' {install($2); set_type("REAL");}
         | PROCEDURE ID arguments ';' {install($2); set_proc($2);}
arguments: '(' parameter_list ')'
         | /* empty */;
parameter_list: identifier_list ':' type
         | parameter_list ';' identifier_list ':' type
compound_statement: START
                    optional_statements END
optional_statements: statement_list
         | /* empty */
statement_list: statement
         | statement_list ';' statement
statement: variable ASSIGNOP expression
         | procedure_statement
         | compound_statement
         | IF expression THEN statement ELSE statement
         | WHILE expression DO statement
/* context_check() prints a diagnostic for identifiers that are not in the symbol table. */
variable: ID {context_check($1);}
         | ID '[' expression ']' {context_check($1);}
procedure_statement: ID 
         | ID '(' expression_list ')'
expression_list: expression
         | expression_list ',' expression
expression: simple_expression
         | simple_expression RELOP simple_expression
simple_expression: term
         | SIGN term
         | simple_expression SIGN term
         | simple_expression OR term
term: factor
         | term MULOP factor
/* Numeric literals are recorded in the symbol table via install_num(). */
factor: variable
         | ID '(' expression_list ')' {context_check($1);}
         | NUM {install_num($1);}
         | '(' expression ')'
         | NOT factor
/*
 * Destination for the generated assembler. It lives at file scope so that
 * grammar actions can write to the same stream that main() opens; a variable
 * local to main() would hide it and leave the shared one unopened.
 */
FILE *output;

/*
 * Entry point: parse the Pascal source named by the first command-line
 * argument and write the generated code to output.asm.
 *
 * Returns EXIT_SUCCESS when parsing succeeded without recorded errors,
 * EXIT_FAILURE otherwise (including a missing argument or a file that
 * cannot be opened).
 */
int main(int argc, char *argv[])
{
    extern FILE *yyin;
    int status;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <source-file>\n",
                argc > 0 ? argv[0] : "translator");
        return EXIT_FAILURE;
    }

    yyin = fopen(argv[1], "r");
    if (yyin == NULL) {
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    output = fopen("output.asm", "w");
    if (output == NULL) {
        perror("output.asm");
        fclose(yyin);
        return EXIT_FAILURE;
    }

    fprintf(output, "\t  jump.i #lab0\n");
    yydebug = 1;
    errors = 0;
    /* Run the parser: the grammar actions are what fill the symbol table. */
    status = yyparse();
    fprintf(output, "\t  exit");

    /* Closing flushes the buffered output, so a failure here means lost data. */
    if (fclose(output) != 0) {
        perror("output.asm");
        status = 1;
    }
    fclose(yyin);

    return (status == 0 && errors == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}

/*
 * Called by yyparse on error: prints the parser's message s on its own line
 * on standard output. Always returns 0.
 */
int yyerror(const char *s)
{
    printf("%s\n", s);
    return 0;
}

Here is symbol table:


/* One entry of the symbol table, which is kept as a singly linked list. */
struct symrec {
    char *name;           /* symbol text, used as the lookup key */
    int addr;             /* not assigned by the visible code; presumably the
                             variable's address for code generation -- TODO confirm */
    char *type;           /* type tag such as "INTEGER"; null until one is assigned */
    struct symrec *next;  /* next entry in the list, or null at the end */
};
typedef struct symrec symrec;

/* Head of the list; new entries are pushed at the front. */
symrec *sym_table = (symrec *)0;

symrec *putsym(char *sym_name);
symrec *getsym(char *sym_name);
symrec *putnum(char *sym_name);
void set_type(char *type);
void set_proc(char *sym_name);
/* No definition is visible here, so the parameter list is left unspecified. */
void set_func();
void print_sym_table(void);

symrec *putsym(char *sym_name)
    symrec *ptr;
    ptr = (symrec *)malloc(sizeof(symrec));
    ptr->name = (char *)malloc(strlen(sym_name) + 1);
    ptr->type = NULL;
    ptr->next = (struct symrec *)sym_table;
    sym_table = ptr;
    return ptr;

symrec *putnum(char *sym_name)
    symrec *ptr;
    char *dPos = strchr(sym_name, '.');
    char *ePos = strchr(sym_name, 'e');
    ptr = (symrec *)malloc(sizeof(symrec));
    ptr->name = (char *)malloc(strlen(sym_name) + 1);
    if ((dPos == NULL) && (ePos == NULL)){
        ptr->type = (char *)malloc(strlen("INTEGER") + 1);
        strcpy(ptr->type, "INTEGER");
    else if ((dPos != NULL) && (ePos == NULL)) {
        ptr->type = (char *)malloc(strlen("REAL") + 1);
        strcpy(ptr->type, "REAL");
    else {
        ptr->type = (char *)malloc(strlen("FLOAT") + 1);
        strcpy(ptr->type, "FLOAT");
    ptr->next = (struct symrec *)sym_table;
    sym_table = ptr;
    return ptr;

void set_type(char *type)
    symrec *ptr;
    for (ptr = sym_table; ptr != (symrec *)0; ptr = (symrec *)ptr->next) {
        if (ptr->type == NULL) {
            ptr->type = (char *)malloc(strlen(type) + 1);
            strcpy(ptr->type, type);

void set_proc(char *sym_name) {
    symrec *ptr;
    for (ptr = sym_table; ptr != (symrec *)0; ptr = (symrec *)ptr->next)
        if (strcmp (ptr->name, sym_name) == 0){
            ptr->type = (char *)malloc(strlen("PROC") + 1);
            strcpy(ptr->type, "PROC");

symrec *getsym(char *sym_name)
    symrec *ptr;
    for (ptr = sym_table; ptr != (symrec *)0; ptr = (symrec *)ptr->next)
        if (strcmp (ptr->name, sym_name) == 0)
            return ptr;
    return 0;

void print_sym_table()
    symrec *ptr;
    for (ptr = sym_table; ptr != (symrec *)0; ptr = (symrec *)ptr->next)
        printf("\n%s    %s\n", ptr->name, ptr->type);

Simple test file


program example(input, output);
var x, y: integer;
var g,h:real;


And what it should print to the output file:


     jump.i  #lab0                   ;jump.i  lab0
        add.i   0,4,24                  ;add.i   x,y,$t0
        inttoreal.i 24,28               ;inttoreal.i $t0,$t1
        mov.r   28,8                    ;mov.r   $t1,g
        write.r 8                       ;write.r g
        exit                            ;exit    

comments (;jump.i lab0) are not necessary.

注释(; jump.i lab0)不是必需的。

I know how the addresses of variables should be calculated, and I can translate Pascal code to this assembler on paper, but I really don't understand where and what I should put in the bison or flex file so that it generates assembler code into the output file. I've tried to generate labels for begin statements in this rule:


compound_statement: START {fprintf(output, "lab0\n");}
                    optional_statements END

But I got a segmentation fault. It's pretty obvious how to generate labels, but how should I generate


add.i 0, 4, 24

Should I create another parser after I've built symbol table with this one? Or is it doable without additional parser. Need some hints what to do next.


So you've got this bit of code:


compound_statement: START {fprintf(output, "lab0\n");}
                    optional_statements END

You're on the right track doing it this way, but you get a segmentation fault when you add it in and this is because output isn't initialised.


I can't see where you've declared the output that is being referenced there, but it isn't the same one that is declared in main where you open a file for output.


main (int argc, char *argv[]) {
    FILE *output = fopen("output.asm", "w");

That version of output is local to main and only visible inside that function. If you remove the declaration of output from main and leave just the assignment, you'll be assigning the result of fopen to the globally declared version of output that your bison code is using.


main (int argc, char *argv[]) {
    output = fopen("output.asm", "w");

Not sure why you're having confusion with the other part of your question since you've demonstrated how to do it already in your parser. Take this bit of your parser:


variable: ID {context_check($1);}

It is taking the value of "ID" - the $1 - and passing it to that function. If you wanted "variable" to contain a value you'd store it in $$. Then when you use "variable" higher up like in here:

它取“ID”的值 - $ 1 - 并将其传递给该函数。如果您希望“变量”包含值,则将其存储在$$中。然后当你使用更高的“变量”时,就像在这里:

statement: variable ASSIGNOP expression

$1 will contain whatever value you put in $$ for "variable". $2 will be the value obtained from "ASSIGNOP" token and $3 will have the results from "expression". And again if you store a value in $$ you'd be able to use it in anything that is expecting a "statement".

$ 1将包含您在$$中为“变量”添加的任何值。 $ 2将是从“ASSIGNOP”令牌获得的值,$ 3将获得“表达式”的结果。如果你在$$中存储一个值,你就可以在任何期望“声明”的东西中使用它。

$$, $1 etc... are all of the type you've created by using %union, so you can also do $$.intValue or $2.stringValue if you need to specifically state which value you're setting.

$$,$ 1等...都是使用%union创建的所有类型,因此如果需要明确说明要设置的值,也可以执行$$。intValue或$ 2.stringValue。



In your parser, for example, you have a pattern:


| term MULOP factor

You would like to put an action on that pattern that was something like:


{ fprintf(output, "mul term, factor, result\n"); }

but it starts to get sticky very quickly: where are term and factor, and where should you put the result? The easiest answer is a stack: whenever a variable is referenced, push its value onto the stack. Whenever an operation is matched, pop the operand(s) into registers, perform the operation, and push the result, so the above becomes:


   fprintf(output, "pop r0; pop r1; mul r1, r0, r0;");
   fprintf(output, "push r0\n");

and assignments just pop the stack into a variable.




