OCamlLangImpl3.rst 37 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961
  1. ========================================
  2. Kaleidoscope: Code generation to LLVM IR
  3. ========================================
  4. .. contents::
  5. :local:
  6. Chapter 3 Introduction
  7. ======================
  8. Welcome to Chapter 3 of the "`Implementing a language with
  9. LLVM <index.html>`_" tutorial. This chapter shows you how to transform
  10. the `Abstract Syntax Tree <OCamlLangImpl2.html>`_, built in Chapter 2,
  11. into LLVM IR. This will teach you a little bit about how LLVM does
  12. things, as well as demonstrate how easy it is to use. It's much more
  13. work to build a lexer and parser than it is to generate LLVM IR code. :)
  14. **Please note**: the code in this chapter and later require LLVM 2.3 or
  15. LLVM SVN to work. LLVM 2.2 and before will not work with it.
  16. Code Generation Setup
  17. =====================
  18. In order to generate LLVM IR, we want some simple setup to get started.
  19. First we define virtual code generation (codegen) methods in each AST
  20. class:
  21. .. code-block:: ocaml
  22. let rec codegen_expr = function
  23. | Ast.Number n -> ...
  24. | Ast.Variable name -> ...
  25. The ``Codegen.codegen_expr`` function says to emit IR for that AST node
  26. along with all the things it depends on, and they all return an LLVM
  27. Value object. "Value" is the class used to represent a "`Static Single
  28. Assignment
  29. (SSA) <http://en.wikipedia.org/wiki/Static_single_assignment_form>`_
  30. register" or "SSA value" in LLVM. The most distinct aspect of SSA values
  31. is that their value is computed as the related instruction executes, and
  32. it does not get a new value until (and if) the instruction re-executes.
  33. In other words, there is no way to "change" an SSA value. For more
  34. information, please read up on `Static Single
  35. Assignment <http://en.wikipedia.org/wiki/Static_single_assignment_form>`_
  36. - the concepts are really quite natural once you grok them.
  37. The second thing we want is an "Error" exception like we used for the
  38. parser, which will be used to report errors found during code generation
  39. (for example, use of an undeclared parameter):
  40. .. code-block:: ocaml
  41. exception Error of string
  42. let context = global_context ()
  43. let the_module = create_module context "my cool jit"
  44. let builder = builder context
  45. let named_values:(string, llvalue) Hashtbl.t = Hashtbl.create 10
  46. let double_type = double_type context
  47. The static variables will be used during code generation.
  48. ``Codgen.the_module`` is the LLVM construct that contains all of the
  49. functions and global variables in a chunk of code. In many ways, it is
  50. the top-level structure that the LLVM IR uses to contain code.
  51. The ``Codegen.builder`` object is a helper object that makes it easy to
  52. generate LLVM instructions. Instances of the
  53. `IRBuilder <http://llvm.org/doxygen/IRBuilder_8h-source.html>`_
  54. class keep track of the current place to insert instructions and has
  55. methods to create new instructions.
  56. The ``Codegen.named_values`` map keeps track of which values are defined
  57. in the current scope and what their LLVM representation is. (In other
  58. words, it is a symbol table for the code). In this form of Kaleidoscope,
  59. the only things that can be referenced are function parameters. As such,
  60. function parameters will be in this map when generating code for their
  61. function body.
  62. With these basics in place, we can start talking about how to generate
  63. code for each expression. Note that this assumes that the
  64. ``Codgen.builder`` has been set up to generate code *into* something.
  65. For now, we'll assume that this has already been done, and we'll just
  66. use it to emit code.
  67. Expression Code Generation
  68. ==========================
  69. Generating LLVM code for expression nodes is very straightforward: less
  70. than 30 lines of commented code for all four of our expression nodes.
  71. First we'll do numeric literals:
  72. .. code-block:: ocaml
  73. | Ast.Number n -> const_float double_type n
  74. In the LLVM IR, numeric constants are represented with the
  75. ``ConstantFP`` class, which holds the numeric value in an ``APFloat``
  76. internally (``APFloat`` has the capability of holding floating point
  77. constants of Arbitrary Precision). This code basically just creates
  78. and returns a ``ConstantFP``. Note that in the LLVM IR that constants
  79. are all uniqued together and shared. For this reason, the API uses "the
  80. foo::get(..)" idiom instead of "new foo(..)" or "foo::Create(..)".
  81. .. code-block:: ocaml
  82. | Ast.Variable name ->
  83. (try Hashtbl.find named_values name with
  84. | Not_found -> raise (Error "unknown variable name"))
  85. References to variables are also quite simple using LLVM. In the simple
  86. version of Kaleidoscope, we assume that the variable has already been
  87. emitted somewhere and its value is available. In practice, the only
  88. values that can be in the ``Codegen.named_values`` map are function
  89. arguments. This code simply checks to see that the specified name is in
  90. the map (if not, an unknown variable is being referenced) and returns
  91. the value for it. In future chapters, we'll add support for `loop
  92. induction variables <LangImpl5.html#for-loop-expression>`_ in the symbol table, and for
  93. `local variables <LangImpl7.html#user-defined-local-variables>`_.
  94. .. code-block:: ocaml
  95. | Ast.Binary (op, lhs, rhs) ->
  96. let lhs_val = codegen_expr lhs in
  97. let rhs_val = codegen_expr rhs in
  98. begin
  99. match op with
  100. | '+' -> build_fadd lhs_val rhs_val "addtmp" builder
  101. | '-' -> build_fsub lhs_val rhs_val "subtmp" builder
  102. | '*' -> build_fmul lhs_val rhs_val "multmp" builder
  103. | '<' ->
  104. (* Convert bool 0/1 to double 0.0 or 1.0 *)
  105. let i = build_fcmp Fcmp.Ult lhs_val rhs_val "cmptmp" builder in
  106. build_uitofp i double_type "booltmp" builder
  107. | _ -> raise (Error "invalid binary operator")
  108. end
  109. Binary operators start to get more interesting. The basic idea here is
  110. that we recursively emit code for the left-hand side of the expression,
  111. then the right-hand side, then we compute the result of the binary
  112. expression. In this code, we do a simple switch on the opcode to create
  113. the right LLVM instruction.
  114. In the example above, the LLVM builder class is starting to show its
  115. value. IRBuilder knows where to insert the newly created instruction,
  116. all you have to do is specify what instruction to create (e.g. with
  117. ``Llvm.create_add``), which operands to use (``lhs`` and ``rhs`` here)
  118. and optionally provide a name for the generated instruction.
  119. One nice thing about LLVM is that the name is just a hint. For instance,
  120. if the code above emits multiple "addtmp" variables, LLVM will
  121. automatically provide each one with an increasing, unique numeric
  122. suffix. Local value names for instructions are purely optional, but it
  123. makes it much easier to read the IR dumps.
  124. `LLVM instructions <../LangRef.html#instruction-reference>`_ are constrained by strict
  125. rules: for example, the Left and Right operators of an `add
  126. instruction <../LangRef.html#add-instruction>`_ must have the same type, and the
  127. result type of the add must match the operand types. Because all values
  128. in Kaleidoscope are doubles, this makes for very simple code for add,
  129. sub and mul.
  130. On the other hand, LLVM specifies that the `fcmp
  131. instruction <../LangRef.html#fcmp-instruction>`_ always returns an 'i1' value (a
  132. one bit integer). The problem with this is that Kaleidoscope wants the
  133. value to be a 0.0 or 1.0 value. In order to get these semantics, we
  134. combine the fcmp instruction with a `uitofp
  135. instruction <../LangRef.html#uitofp-to-instruction>`_. This instruction converts its
  136. input integer into a floating point value by treating the input as an
  137. unsigned value. In contrast, if we used the `sitofp
  138. instruction <../LangRef.html#sitofp-to-instruction>`_, the Kaleidoscope '<' operator
  139. would return 0.0 and -1.0, depending on the input value.
  140. .. code-block:: ocaml
  141. | Ast.Call (callee, args) ->
  142. (* Look up the name in the module table. *)
  143. let callee =
  144. match lookup_function callee the_module with
  145. | Some callee -> callee
  146. | None -> raise (Error "unknown function referenced")
  147. in
  148. let params = params callee in
  149. (* If argument mismatch error. *)
  150. if Array.length params == Array.length args then () else
  151. raise (Error "incorrect # arguments passed");
  152. let args = Array.map codegen_expr args in
  153. build_call callee args "calltmp" builder
  154. Code generation for function calls is quite straightforward with LLVM.
  155. The code above initially does a function name lookup in the LLVM
  156. Module's symbol table. Recall that the LLVM Module is the container that
  157. holds all of the functions we are JIT'ing. By giving each function the
  158. same name as what the user specifies, we can use the LLVM symbol table
  159. to resolve function names for us.
  160. Once we have the function to call, we recursively codegen each argument
  161. that is to be passed in, and create an LLVM `call
  162. instruction <../LangRef.html#call-instruction>`_. Note that LLVM uses the native C
  163. calling conventions by default, allowing these calls to also call into
  164. standard library functions like "sin" and "cos", with no additional
  165. effort.
  166. This wraps up our handling of the four basic expressions that we have so
  167. far in Kaleidoscope. Feel free to go in and add some more. For example,
  168. by browsing the `LLVM language reference <../LangRef.html>`_ you'll find
  169. several other interesting instructions that are really easy to plug into
  170. our basic framework.
  171. Function Code Generation
  172. ========================
  173. Code generation for prototypes and functions must handle a number of
  174. details, which make their code less beautiful than expression code
  175. generation, but allows us to illustrate some important points. First,
  176. lets talk about code generation for prototypes: they are used both for
  177. function bodies and external function declarations. The code starts
  178. with:
  179. .. code-block:: ocaml
  180. let codegen_proto = function
  181. | Ast.Prototype (name, args) ->
  182. (* Make the function type: double(double,double) etc. *)
  183. let doubles = Array.make (Array.length args) double_type in
  184. let ft = function_type double_type doubles in
  185. let f =
  186. match lookup_function name the_module with
  187. This code packs a lot of power into a few lines. Note first that this
  188. function returns a "Function\*" instead of a "Value\*" (although at the
  189. moment they both are modeled by ``llvalue`` in ocaml). Because a
  190. "prototype" really talks about the external interface for a function
  191. (not the value computed by an expression), it makes sense for it to
  192. return the LLVM Function it corresponds to when codegen'd.
  193. The call to ``Llvm.function_type`` creates the ``Llvm.llvalue`` that
  194. should be used for a given Prototype. Since all function arguments in
  195. Kaleidoscope are of type double, the first line creates a vector of "N"
  196. LLVM double types. It then uses the ``Llvm.function_type`` method to
  197. create a function type that takes "N" doubles as arguments, returns one
  198. double as a result, and that is not vararg (that uses the function
  199. ``Llvm.var_arg_function_type``). Note that Types in LLVM are uniqued
  200. just like ``Constant``'s are, so you don't "new" a type, you "get" it.
  201. The final line above checks if the function has already been defined in
  202. ``Codegen.the_module``. If not, we will create it.
  203. .. code-block:: ocaml
  204. | None -> declare_function name ft the_module
  205. This indicates the type and name to use, as well as which module to
  206. insert into. By default we assume a function has
  207. ``Llvm.Linkage.ExternalLinkage``. "`external
  208. linkage <../LangRef.html#linkage>`_" means that the function may be defined
  209. outside the current module and/or that it is callable by functions
  210. outside the module. The "``name``" passed in is the name the user
  211. specified: this name is registered in "``Codegen.the_module``"s symbol
  212. table, which is used by the function call code above.
  213. In Kaleidoscope, I choose to allow redefinitions of functions in two
  214. cases: first, we want to allow 'extern'ing a function more than once, as
  215. long as the prototypes for the externs match (since all arguments have
  216. the same type, we just have to check that the number of arguments
  217. match). Second, we want to allow 'extern'ing a function and then
  218. defining a body for it. This is useful when defining mutually recursive
  219. functions.
  220. .. code-block:: ocaml
  221. (* If 'f' conflicted, there was already something named 'name'. If it
  222. * has a body, don't allow redefinition or reextern. *)
  223. | Some f ->
  224. (* If 'f' already has a body, reject this. *)
  225. if Array.length (basic_blocks f) == 0 then () else
  226. raise (Error "redefinition of function");
  227. (* If 'f' took a different number of arguments, reject. *)
  228. if Array.length (params f) == Array.length args then () else
  229. raise (Error "redefinition of function with different # args");
  230. f
  231. in
  232. In order to verify the logic above, we first check to see if the
  233. pre-existing function is "empty". In this case, empty means that it has
  234. no basic blocks in it, which means it has no body. If it has no body, it
  235. is a forward declaration. Since we don't allow anything after a full
  236. definition of the function, the code rejects this case. If the previous
  237. reference to a function was an 'extern', we simply verify that the
  238. number of arguments for that definition and this one match up. If not,
  239. we emit an error.
  240. .. code-block:: ocaml
  241. (* Set names for all arguments. *)
  242. Array.iteri (fun i a ->
  243. let n = args.(i) in
  244. set_value_name n a;
  245. Hashtbl.add named_values n a;
  246. ) (params f);
  247. f
  248. The last bit of code for prototypes loops over all of the arguments in
  249. the function, setting the name of the LLVM Argument objects to match,
  250. and registering the arguments in the ``Codegen.named_values`` map for
  251. future use by the ``Ast.Variable`` variant. Once this is set up, it
  252. returns the Function object to the caller. Note that we don't check for
  253. conflicting argument names here (e.g. "extern foo(a b a)"). Doing so
  254. would be very straight-forward with the mechanics we have already used
  255. above.
  256. .. code-block:: ocaml
  257. let codegen_func = function
  258. | Ast.Function (proto, body) ->
  259. Hashtbl.clear named_values;
  260. let the_function = codegen_proto proto in
  261. Code generation for function definitions starts out simply enough: we
  262. just codegen the prototype (Proto) and verify that it is ok. We then
  263. clear out the ``Codegen.named_values`` map to make sure that there isn't
  264. anything in it from the last function we compiled. Code generation of
  265. the prototype ensures that there is an LLVM Function object that is
  266. ready to go for us.
  267. .. code-block:: ocaml
  268. (* Create a new basic block to start insertion into. *)
  269. let bb = append_block context "entry" the_function in
  270. position_at_end bb builder;
  271. try
  272. let ret_val = codegen_expr body in
  273. Now we get to the point where the ``Codegen.builder`` is set up. The
  274. first line creates a new `basic
  275. block <http://en.wikipedia.org/wiki/Basic_block>`_ (named "entry"),
  276. which is inserted into ``the_function``. The second line then tells the
  277. builder that new instructions should be inserted into the end of the new
  278. basic block. Basic blocks in LLVM are an important part of functions
  279. that define the `Control Flow
  280. Graph <http://en.wikipedia.org/wiki/Control_flow_graph>`_. Since we
  281. don't have any control flow, our functions will only contain one block
  282. at this point. We'll fix this in `Chapter 5 <OCamlLangImpl5.html>`_ :).
  283. .. code-block:: ocaml
  284. let ret_val = codegen_expr body in
  285. (* Finish off the function. *)
  286. let _ = build_ret ret_val builder in
  287. (* Validate the generated code, checking for consistency. *)
  288. Llvm_analysis.assert_valid_function the_function;
  289. the_function
  290. Once the insertion point is set up, we call the ``Codegen.codegen_func``
  291. method for the root expression of the function. If no error happens,
  292. this emits code to compute the expression into the entry block and
  293. returns the value that was computed. Assuming no error, we then create
  294. an LLVM `ret instruction <../LangRef.html#ret-instruction>`_, which completes the
  295. function. Once the function is built, we call
  296. ``Llvm_analysis.assert_valid_function``, which is provided by LLVM. This
  297. function does a variety of consistency checks on the generated code, to
  298. determine if our compiler is doing everything right. Using this is
  299. important: it can catch a lot of bugs. Once the function is finished and
  300. validated, we return it.
  301. .. code-block:: ocaml
  302. with e ->
  303. delete_function the_function;
  304. raise e
  305. The only piece left here is handling of the error case. For simplicity,
  306. we handle this by merely deleting the function we produced with the
  307. ``Llvm.delete_function`` method. This allows the user to redefine a
  308. function that they incorrectly typed in before: if we didn't delete it,
  309. it would live in the symbol table, with a body, preventing future
  310. redefinition.
  311. This code does have a bug, though. Since the ``Codegen.codegen_proto``
  312. can return a previously defined forward declaration, our code can
  313. actually delete a forward declaration. There are a number of ways to fix
  314. this bug, see what you can come up with! Here is a testcase:
  315. ::
  316. extern foo(a b); # ok, defines foo.
  317. def foo(a b) c; # error, 'c' is invalid.
  318. def bar() foo(1, 2); # error, unknown function "foo"
  319. Driver Changes and Closing Thoughts
  320. ===================================
  321. For now, code generation to LLVM doesn't really get us much, except that
  322. we can look at the pretty IR calls. The sample code inserts calls to
  323. Codegen into the "``Toplevel.main_loop``", and then dumps out the LLVM
  324. IR. This gives a nice way to look at the LLVM IR for simple functions.
  325. For example:
  326. ::
  327. ready> 4+5;
  328. Read top-level expression:
  329. define double @""() {
  330. entry:
  331. %addtmp = fadd double 4.000000e+00, 5.000000e+00
  332. ret double %addtmp
  333. }
  334. Note how the parser turns the top-level expression into anonymous
  335. functions for us. This will be handy when we add `JIT
  336. support <OCamlLangImpl4.html#adding-a-jit-compiler>`_ in the next chapter. Also note that
  337. the code is very literally transcribed, no optimizations are being
  338. performed. We will `add
  339. optimizations <OCamlLangImpl4.html#trivial-constant-folding>`_ explicitly in the
  340. next chapter.
  341. ::
  342. ready> def foo(a b) a*a + 2*a*b + b*b;
  343. Read function definition:
  344. define double @foo(double %a, double %b) {
  345. entry:
  346. %multmp = fmul double %a, %a
  347. %multmp1 = fmul double 2.000000e+00, %a
  348. %multmp2 = fmul double %multmp1, %b
  349. %addtmp = fadd double %multmp, %multmp2
  350. %multmp3 = fmul double %b, %b
  351. %addtmp4 = fadd double %addtmp, %multmp3
  352. ret double %addtmp4
  353. }
  354. This shows some simple arithmetic. Notice the striking similarity to the
  355. LLVM builder calls that we use to create the instructions.
  356. ::
  357. ready> def bar(a) foo(a, 4.0) + bar(31337);
  358. Read function definition:
  359. define double @bar(double %a) {
  360. entry:
  361. %calltmp = call double @foo(double %a, double 4.000000e+00)
  362. %calltmp1 = call double @bar(double 3.133700e+04)
  363. %addtmp = fadd double %calltmp, %calltmp1
  364. ret double %addtmp
  365. }
  366. This shows some function calls. Note that this function will take a long
  367. time to execute if you call it. In the future we'll add conditional
  368. control flow to actually make recursion useful :).
  369. ::
  370. ready> extern cos(x);
  371. Read extern:
  372. declare double @cos(double)
  373. ready> cos(1.234);
  374. Read top-level expression:
  375. define double @""() {
  376. entry:
  377. %calltmp = call double @cos(double 1.234000e+00)
  378. ret double %calltmp
  379. }
  380. This shows an extern for the libm "cos" function, and a call to it.
  381. ::
  382. ready> ^D
  383. ; ModuleID = 'my cool jit'
  384. define double @""() {
  385. entry:
  386. %addtmp = fadd double 4.000000e+00, 5.000000e+00
  387. ret double %addtmp
  388. }
  389. define double @foo(double %a, double %b) {
  390. entry:
  391. %multmp = fmul double %a, %a
  392. %multmp1 = fmul double 2.000000e+00, %a
  393. %multmp2 = fmul double %multmp1, %b
  394. %addtmp = fadd double %multmp, %multmp2
  395. %multmp3 = fmul double %b, %b
  396. %addtmp4 = fadd double %addtmp, %multmp3
  397. ret double %addtmp4
  398. }
  399. define double @bar(double %a) {
  400. entry:
  401. %calltmp = call double @foo(double %a, double 4.000000e+00)
  402. %calltmp1 = call double @bar(double 3.133700e+04)
  403. %addtmp = fadd double %calltmp, %calltmp1
  404. ret double %addtmp
  405. }
  406. declare double @cos(double)
  407. define double @""() {
  408. entry:
  409. %calltmp = call double @cos(double 1.234000e+00)
  410. ret double %calltmp
  411. }
  412. When you quit the current demo, it dumps out the IR for the entire
  413. module generated. Here you can see the big picture with all the
  414. functions referencing each other.
  415. This wraps up the third chapter of the Kaleidoscope tutorial. Up next,
  416. we'll describe how to `add JIT codegen and optimizer
  417. support <OCamlLangImpl4.html>`_ to this so we can actually start running
  418. code!
  419. Full Code Listing
  420. =================
  421. Here is the complete code listing for our running example, enhanced with
  422. the LLVM code generator. Because this uses the LLVM libraries, we need
  423. to link them in. To do this, we use the
  424. `llvm-config <http://llvm.org/cmds/llvm-config.html>`_ tool to inform
  425. our makefile/command line about which options to use:
  426. .. code-block:: bash
  427. # Compile
  428. ocamlbuild toy.byte
  429. # Run
  430. ./toy.byte
  431. Here is the code:
  432. \_tags:
  433. ::
  434. <{lexer,parser}.ml>: use_camlp4, pp(camlp4of)
  435. <*.{byte,native}>: g++, use_llvm, use_llvm_analysis
  436. myocamlbuild.ml:
  437. .. code-block:: ocaml
  438. open Ocamlbuild_plugin;;
  439. ocaml_lib ~extern:true "llvm";;
  440. ocaml_lib ~extern:true "llvm_analysis";;
  441. flag ["link"; "ocaml"; "g++"] (S[A"-cc"; A"g++"]);;
  442. token.ml:
  443. .. code-block:: ocaml
  444. (*===----------------------------------------------------------------------===
  445. * Lexer Tokens
  446. *===----------------------------------------------------------------------===*)
  447. (* The lexer returns these 'Kwd' if it is an unknown character, otherwise one of
  448. * these others for known things. *)
  449. type token =
  450. (* commands *)
  451. | Def | Extern
  452. (* primary *)
  453. | Ident of string | Number of float
  454. (* unknown *)
  455. | Kwd of char
  456. lexer.ml:
  457. .. code-block:: ocaml
  458. (*===----------------------------------------------------------------------===
  459. * Lexer
  460. *===----------------------------------------------------------------------===*)
  461. let rec lex = parser
  462. (* Skip any whitespace. *)
  463. | [< ' (' ' | '\n' | '\r' | '\t'); stream >] -> lex stream
  464. (* identifier: [a-zA-Z][a-zA-Z0-9] *)
  465. | [< ' ('A' .. 'Z' | 'a' .. 'z' as c); stream >] ->
  466. let buffer = Buffer.create 1 in
  467. Buffer.add_char buffer c;
  468. lex_ident buffer stream
  469. (* number: [0-9.]+ *)
  470. | [< ' ('0' .. '9' as c); stream >] ->
  471. let buffer = Buffer.create 1 in
  472. Buffer.add_char buffer c;
  473. lex_number buffer stream
  474. (* Comment until end of line. *)
  475. | [< ' ('#'); stream >] ->
  476. lex_comment stream
  477. (* Otherwise, just return the character as its ascii value. *)
  478. | [< 'c; stream >] ->
  479. [< 'Token.Kwd c; lex stream >]
  480. (* end of stream. *)
  481. | [< >] -> [< >]
  482. and lex_number buffer = parser
  483. | [< ' ('0' .. '9' | '.' as c); stream >] ->
  484. Buffer.add_char buffer c;
  485. lex_number buffer stream
  486. | [< stream=lex >] ->
  487. [< 'Token.Number (float_of_string (Buffer.contents buffer)); stream >]
  488. and lex_ident buffer = parser
  489. | [< ' ('A' .. 'Z' | 'a' .. 'z' | '0' .. '9' as c); stream >] ->
  490. Buffer.add_char buffer c;
  491. lex_ident buffer stream
  492. | [< stream=lex >] ->
  493. match Buffer.contents buffer with
  494. | "def" -> [< 'Token.Def; stream >]
  495. | "extern" -> [< 'Token.Extern; stream >]
  496. | id -> [< 'Token.Ident id; stream >]
  497. and lex_comment = parser
  498. | [< ' ('\n'); stream=lex >] -> stream
  499. | [< 'c; e=lex_comment >] -> e
  500. | [< >] -> [< >]
  501. ast.ml:
  502. .. code-block:: ocaml
  503. (*===----------------------------------------------------------------------===
  504. * Abstract Syntax Tree (aka Parse Tree)
  505. *===----------------------------------------------------------------------===*)
  506. (* expr - Base type for all expression nodes. *)
  507. type expr =
  508. (* variant for numeric literals like "1.0". *)
  509. | Number of float
  510. (* variant for referencing a variable, like "a". *)
  511. | Variable of string
  512. (* variant for a binary operator. *)
  513. | Binary of char * expr * expr
  514. (* variant for function calls. *)
  515. | Call of string * expr array
  516. (* proto - This type represents the "prototype" for a function, which captures
  517. * its name, and its argument names (thus implicitly the number of arguments the
  518. * function takes). *)
  519. type proto = Prototype of string * string array
  520. (* func - This type represents a function definition itself. *)
  521. type func = Function of proto * expr
  522. parser.ml:
  523. .. code-block:: ocaml
  524. (*===---------------------------------------------------------------------===
  525. * Parser
  526. *===---------------------------------------------------------------------===*)
  527. (* binop_precedence - This holds the precedence for each binary operator that is
  528. * defined *)
  529. let binop_precedence:(char, int) Hashtbl.t = Hashtbl.create 10
  530. (* precedence - Get the precedence of the pending binary operator token. *)
  531. let precedence c = try Hashtbl.find binop_precedence c with Not_found -> -1
  532. (* primary
  533. * ::= identifier
  534. * ::= numberexpr
  535. * ::= parenexpr *)
  536. let rec parse_primary = parser
  537. (* numberexpr ::= number *)
  538. | [< 'Token.Number n >] -> Ast.Number n
  539. (* parenexpr ::= '(' expression ')' *)
  540. | [< 'Token.Kwd '('; e=parse_expr; 'Token.Kwd ')' ?? "expected ')'" >] -> e
  541. (* identifierexpr
  542. * ::= identifier
  543. * ::= identifier '(' argumentexpr ')' *)
  544. | [< 'Token.Ident id; stream >] ->
  545. let rec parse_args accumulator = parser
  546. | [< e=parse_expr; stream >] ->
  547. begin parser
  548. | [< 'Token.Kwd ','; e=parse_args (e :: accumulator) >] -> e
  549. | [< >] -> e :: accumulator
  550. end stream
  551. | [< >] -> accumulator
  552. in
  553. let rec parse_ident id = parser
  554. (* Call. *)
  555. | [< 'Token.Kwd '(';
  556. args=parse_args [];
  557. 'Token.Kwd ')' ?? "expected ')'">] ->
  558. Ast.Call (id, Array.of_list (List.rev args))
  559. (* Simple variable ref. *)
  560. | [< >] -> Ast.Variable id
  561. in
  562. parse_ident id stream
  563. | [< >] -> raise (Stream.Error "unknown token when expecting an expression.")
  564. (* binoprhs
  565. * ::= ('+' primary)* *)
  566. and parse_bin_rhs expr_prec lhs stream =
  567. match Stream.peek stream with
  568. (* If this is a binop, find its precedence. *)
  569. | Some (Token.Kwd c) when Hashtbl.mem binop_precedence c ->
  570. let token_prec = precedence c in
  571. (* If this is a binop that binds at least as tightly as the current binop,
  572. * consume it, otherwise we are done. *)
  573. if token_prec < expr_prec then lhs else begin
  574. (* Eat the binop. *)
  575. Stream.junk stream;
  576. (* Parse the primary expression after the binary operator. *)
  577. let rhs = parse_primary stream in
  578. (* Okay, we know this is a binop. *)
  579. let rhs =
  580. match Stream.peek stream with
  581. | Some (Token.Kwd c2) ->
  582. (* If BinOp binds less tightly with rhs than the operator after
  583. * rhs, let the pending operator take rhs as its lhs. *)
  584. let next_prec = precedence c2 in
  585. if token_prec < next_prec
  586. then parse_bin_rhs (token_prec + 1) rhs stream
  587. else rhs
  588. | _ -> rhs
  589. in
  590. (* Merge lhs/rhs. *)
  591. let lhs = Ast.Binary (c, lhs, rhs) in
  592. parse_bin_rhs expr_prec lhs stream
  593. end
  594. | _ -> lhs
  595. (* expression
  596. * ::= primary binoprhs *)
  597. and parse_expr = parser
  598. | [< lhs=parse_primary; stream >] -> parse_bin_rhs 0 lhs stream
  599. (* prototype
  600. * ::= id '(' id* ')' *)
  601. let parse_prototype =
  602. let rec parse_args accumulator = parser
  603. | [< 'Token.Ident id; e=parse_args (id::accumulator) >] -> e
  604. | [< >] -> accumulator
  605. in
  606. parser
  607. | [< 'Token.Ident id;
  608. 'Token.Kwd '(' ?? "expected '(' in prototype";
  609. args=parse_args [];
  610. 'Token.Kwd ')' ?? "expected ')' in prototype" >] ->
  611. (* success. *)
  612. Ast.Prototype (id, Array.of_list (List.rev args))
  613. | [< >] ->
  614. raise (Stream.Error "expected function name in prototype")
  615. (* definition ::= 'def' prototype expression *)
  616. let parse_definition = parser
  617. | [< 'Token.Def; p=parse_prototype; e=parse_expr >] ->
  618. Ast.Function (p, e)
  619. (* toplevelexpr ::= expression *)
  620. let parse_toplevel = parser
  621. | [< e=parse_expr >] ->
  622. (* Make an anonymous proto. *)
  623. Ast.Function (Ast.Prototype ("", [||]), e)
  624. (* external ::= 'extern' prototype *)
  625. let parse_extern = parser
  626. | [< 'Token.Extern; e=parse_prototype >] -> e
  627. codegen.ml:
  628. .. code-block:: ocaml
  629. (*===----------------------------------------------------------------------===
  630. * Code Generation
  631. *===----------------------------------------------------------------------===*)
  632. open Llvm
  633. exception Error of string
  634. let context = global_context ()
  635. let the_module = create_module context "my cool jit"
  636. let builder = builder context
  637. let named_values:(string, llvalue) Hashtbl.t = Hashtbl.create 10
  638. let double_type = double_type context
  639. let rec codegen_expr = function
  640. | Ast.Number n -> const_float double_type n
  641. | Ast.Variable name ->
  642. (try Hashtbl.find named_values name with
  643. | Not_found -> raise (Error "unknown variable name"))
  644. | Ast.Binary (op, lhs, rhs) ->
  645. let lhs_val = codegen_expr lhs in
  646. let rhs_val = codegen_expr rhs in
  647. begin
  648. match op with
  649. | '+' -> build_add lhs_val rhs_val "addtmp" builder
  650. | '-' -> build_sub lhs_val rhs_val "subtmp" builder
  651. | '*' -> build_mul lhs_val rhs_val "multmp" builder
  652. | '<' ->
  653. (* Convert bool 0/1 to double 0.0 or 1.0 *)
  654. let i = build_fcmp Fcmp.Ult lhs_val rhs_val "cmptmp" builder in
  655. build_uitofp i double_type "booltmp" builder
  656. | _ -> raise (Error "invalid binary operator")
  657. end
  658. | Ast.Call (callee, args) ->
  659. (* Look up the name in the module table. *)
  660. let callee =
  661. match lookup_function callee the_module with
  662. | Some callee -> callee
  663. | None -> raise (Error "unknown function referenced")
  664. in
  665. let params = params callee in
  666. (* If argument mismatch error. *)
  667. if Array.length params == Array.length args then () else
  668. raise (Error "incorrect # arguments passed");
  669. let args = Array.map codegen_expr args in
  670. build_call callee args "calltmp" builder
  671. let codegen_proto = function
  672. | Ast.Prototype (name, args) ->
  673. (* Make the function type: double(double,double) etc. *)
  674. let doubles = Array.make (Array.length args) double_type in
  675. let ft = function_type double_type doubles in
  676. let f =
  677. match lookup_function name the_module with
  678. | None -> declare_function name ft the_module
  679. (* If 'f' conflicted, there was already something named 'name'. If it
  680. * has a body, don't allow redefinition or reextern. *)
  681. | Some f ->
  682. (* If 'f' already has a body, reject this. *)
  683. if block_begin f <> At_end f then
  684. raise (Error "redefinition of function");
  685. (* If 'f' took a different number of arguments, reject. *)
  686. if element_type (type_of f) <> ft then
  687. raise (Error "redefinition of function with different # args");
  688. f
  689. in
  690. (* Set names for all arguments. *)
  691. Array.iteri (fun i a ->
  692. let n = args.(i) in
  693. set_value_name n a;
  694. Hashtbl.add named_values n a;
  695. ) (params f);
  696. f
  697. let codegen_func = function
  698. | Ast.Function (proto, body) ->
  699. Hashtbl.clear named_values;
  700. let the_function = codegen_proto proto in
  701. (* Create a new basic block to start insertion into. *)
  702. let bb = append_block context "entry" the_function in
  703. position_at_end bb builder;
  704. try
  705. let ret_val = codegen_expr body in
  706. (* Finish off the function. *)
  707. let _ = build_ret ret_val builder in
  708. (* Validate the generated code, checking for consistency. *)
  709. Llvm_analysis.assert_valid_function the_function;
  710. the_function
  711. with e ->
  712. delete_function the_function;
  713. raise e
  714. toplevel.ml:
  715. .. code-block:: ocaml
  716. (*===----------------------------------------------------------------------===
  717. * Top-Level parsing and JIT Driver
  718. *===----------------------------------------------------------------------===*)
  719. open Llvm
  720. (* top ::= definition | external | expression | ';' *)
  721. let rec main_loop stream =
  722. match Stream.peek stream with
  723. | None -> ()
  724. (* ignore top-level semicolons. *)
  725. | Some (Token.Kwd ';') ->
  726. Stream.junk stream;
  727. main_loop stream
  728. | Some token ->
  729. begin
  730. try match token with
  731. | Token.Def ->
  732. let e = Parser.parse_definition stream in
  733. print_endline "parsed a function definition.";
  734. dump_value (Codegen.codegen_func e);
  735. | Token.Extern ->
  736. let e = Parser.parse_extern stream in
  737. print_endline "parsed an extern.";
  738. dump_value (Codegen.codegen_proto e);
  739. | _ ->
  740. (* Evaluate a top-level expression into an anonymous function. *)
  741. let e = Parser.parse_toplevel stream in
  742. print_endline "parsed a top-level expr";
  743. dump_value (Codegen.codegen_func e);
  744. with Stream.Error s | Codegen.Error s ->
  745. (* Skip token for error recovery. *)
  746. Stream.junk stream;
  747. print_endline s;
  748. end;
  749. print_string "ready> "; flush stdout;
  750. main_loop stream
  751. toy.ml:
  752. .. code-block:: ocaml
  753. (*===----------------------------------------------------------------------===
  754. * Main driver code.
  755. *===----------------------------------------------------------------------===*)
  756. open Llvm
  757. let main () =
  758. (* Install standard binary operators.
  759. * 1 is the lowest precedence. *)
  760. Hashtbl.add Parser.binop_precedence '<' 10;
  761. Hashtbl.add Parser.binop_precedence '+' 20;
  762. Hashtbl.add Parser.binop_precedence '-' 20;
  763. Hashtbl.add Parser.binop_precedence '*' 40; (* highest. *)
  764. (* Prime the first token. *)
  765. print_string "ready> "; flush stdout;
  766. let stream = Lexer.lex (Stream.of_channel stdin) in
  767. (* Run the main "interpreter loop" now. *)
  768. Toplevel.main_loop stream;
  769. (* Print out all the generated code. *)
  770. dump_module Codegen.the_module
  771. ;;
  772. main ()
  773. `Next: Adding JIT and Optimizer Support <OCamlLangImpl4.html>`_