\documentclass[11pt]{article}
\usepackage[T1]{fontenc}
\usepackage{textcomp}
\usepackage{lmodern}
\usepackage[margin=1in]{geometry}
\usepackage[pdftex]{graphicx, color}
\usepackage{mathtools}
\usepackage{listings}
\usepackage[pdfusetitle]{hyperref}

\usepackage{tikz}
\usetikzlibrary{automata,positioning}

\renewcommand{\epsilon}{\varepsilon}
\newcommand{\tikzname}{Ti\emph{k}Z}
\tikzset{shorten >=1pt, node distance=2cm, on grid, baseline={([yshift=-8pt] current bounding box.north)}}


\lstset{basicstyle=\small\ttfamily,breaklines=true}

\title{CS143 Spring 2026 -- Written Assignment 1}

\begin{document}
\begin{center}
% Change this:
{\LARGE{CS143 Spring 2026 -- Written Assignment 1}} \\
{\large Due Thursday, April 16, 2026 11:59 PM PDT}
\end{center}

This assignment covers regular languages, finite automata, and lexical analysis. You may discuss this assignment with other students and work on the problems together. However, your write-up should be your own individual work, and you should indicate in your submission who you worked with, if applicable. Assignments can be submitted electronically through Gradescope as a \textsc{pdf} by 11:59 \textsc{pm pdt} on the due date. Please review the course policies for more information: \url{http://web.stanford.edu/class/cs143/policies/index.html}. A \LaTeX{} template for writing your solutions is available on the course website. To create finite automata diagrams, you can either use the \tikzname{} package directly by following the examples in the template, or a tool like \url{https://madebyevan.com/fsm/}.

\begin{enumerate}
% problem 1
\item Write regular expressions \textit{and} DFAs that recognize the following languages over the alphabet $\Sigma = \{0, 1\}$. Your DFAs must contain no more than 4 states.
\begin{enumerate}
    \item The set of strings that end with 110. \\

    \textbf{Solution:} 

    Regular expression:
    \[ \]

    DFA:
    
    For your convenience, below is a simple LaTeX code example to demonstrate how you can draw nodes, edges, initial state and accepting states in the \texttt{tikz} environment.
    \begin{center}
    \begin{tikzpicture}[auto]
        \node[state, initial, accepting] (q0) {};
        \node[state] (q1) [right of=q0] {};
        \node[state] (q2) [right of=q1] {};
        
        \path[->]
            (q0) edge node[swap] {$0,1$} (q1)
            (q0) edge[loop above] node {$0,1$} (q0)
            (q1) edge[bend right=20] node[swap] {$0,1$} (q0)
            (q1) edge node {$0,1$} (q2)
            ;
    \end{tikzpicture}
    \end{center}

    \

    \item The set of strings that contain less than three 0's. \\

    \textbf{Solution:} 
    
    Regular expression:
    \[ \]

    DFA:
  

    \newpage
    
    \item The set of strings that do not contain two or more consequent 0's.\\

    \textbf{Solution:} 
    
    Regular expression:
    \[ \]

    DFA:

    \vspace{5em}

    \ 

    \item The set of strings that, when interpreted as a binary number, is a multiple of 3 (as an edge case, the empty string shall be interpreted as number 0, which is a multiple of 3). \\
    \textbf{Hint:} For this subproblem, it might be easier to draw the DFA first, and then construct the regular expression by considering what paths are possible in the DFA to reach the accept state.\\

    \textbf{Solution:} 

    DFA:

    \vspace{5em}

    Regular expression:
    \[ \]

\end{enumerate}


\newpage


% Problem 4
\item Write NFAs that recognize the following languages over the alphabet $\Sigma = \{a, b, c\}$. 
\begin{enumerate}
\item The language $L$ where a string $w$ is in $L$ iff there exists $i<j$ such that $w[i]=$\texttt{`a'} and $w[j]=$\texttt{`b'}. For example, \texttt{ab}, \texttt{bacb} are in $L$, where \texttt{ba}, \texttt{ac} are not. Your NFA should have no more than 3 states.

\textbf{Solution:}

\vspace{5em}

\item The language $L$ where a string $w$ is in $L$ iff some character $x\in \{a, b, c\}$ appears an odd number of times in $w$. For example, \texttt{ab} is in $L$, but \texttt{aa} is not.
Your NFA should have no more than 8 states.

\textbf{Solution:}


\end{enumerate}

\newpage

% Problem 3
\item Using the techniques covered in class, transform the following NFAs over the alphabet $\{a, b, c\}$ into DFAs. Your DFAs should not have more than 10 states.  Note that a DFA must have a transition defined for every state and symbol pair, whereas a NFA need not. You must take this fact into account for your transformations. Hint: Is there a subset of states the NFA transitions to when fed a symbol for which the set of current states has no explicit transition?

Also include a mapping from each state of your DFA to the corresponding states of the original NFA.  Specifically, a state $q$ of your DFA maps to the set of states $Q$ of the NFA such that an input string stops at $q$ in the DFA if and only if it stops at one of the states in $Q$ in the NFA.

Tip: for readability, states in the DFA may be labeled according to the set of states they represent in the NFA.  For example, state $q_{012}$ in the DFA would correspond to the set of states $\{q_0, q_1, q_2\}$ in the NFA, whereas state $q_{13}$ would correspond to set of states $\{q_1, q_3\}$ in the NFA.

\begin{enumerate}
    \item \begin{tikzpicture}[auto]
        \node[state,initial] (q_0) {$q_0$};
        \node[state,accepting] (q_1) [right of=q_0] {$q_1$};
        \path[->]
            (q_0) edge[loop above]   node {$c$}   (q_0)
            (q_0) edge[bend left=20] node {$b,c$} (q_1)
            (q_1) edge[bend left=20] node {$a$}   (q_0)
            (q_1) edge[loop above]   node {$c$}   (q_1);
    \end{tikzpicture}

    \textbf{Solution:}

    \newpage
    
    \item \begin{tikzpicture}[auto]
        \node[state,initial]   (q_0)                      {$q_0$};
        \node[state]           (q_1) [above right of=q_0] {$q_1$};
        \node[state,accepting] (q_2) [below right of=q_1] {$q_2$};
        \path[->]
            (q_0) edge[loop below] node {$b,c$} (q_0)
                  edge             node {$b$}   (q_1)
            (q_1) edge             node {$c$}   (q_2)
            (q_1) edge[loop above] node {$c$}   (q_1)
            (q_2) edge             node {$a$}   (q_0)
            (q_2) edge[loop right] node {$a$}   (q_2);
    \end{tikzpicture}

    \textbf{Solution:}

    \newpage
    
    \item \begin{tikzpicture}[auto]
        \node[state, initial] (q_0) {$q_0$};
        \node[state] (q_2) [right of=q_0] {$q_2$};
        \node[state] (q_3) [below of=q_2] {$q_3$};
        \node[state] (q_1) [above of=q_2] {$q_1$};
        \node[state, accepting] (q_4) [right of=q_2] {$q_4$};

        \path[->]
            (q_0) edge node {$a$} (q_1)
            (q_0) edge node {$b$} (q_2)
            (q_0) edge node {$c$} (q_3)
            (q_1) edge node {$\epsilon$} (q_2)
            (q_2) edge node {$\epsilon$} (q_3)
            (q_3) edge[bend left=30] node {$\epsilon$} (q_0)
            (q_1) edge node {$a$} (q_4)
            (q_2) edge node {$b$} (q_4)
            (q_3) edge node {$c$} (q_4);
        
    \end{tikzpicture}

    \textbf{Solution:}
    
\end{enumerate}

\newpage

% Problem 5
\item Consider the following tokens and their associated regular expressions, given as a \textbf{flex} scanner specification:
\begin{quote}
\begin{lstlisting}
%%
0                         printf("a");
(10|01)                   printf("b");
(101|011+)                printf("c");
\end{lstlisting}
\end{quote}
Give an input to this scanner such that the output lexeme sequence is \texttt{aacbbca}. 

For ease of grading, please use underlines to show the parts of your input string that correspond to each lexeme, for example, \underline{10}\,\underline{0}\,\underline{0111}.

\textbf{Solution:}


\newpage

% Problem 6
\item Recall from the lecture that, when using regular expressions to scan an input, we resolve conflicts by taking the largest possible match at any point. That is, if we have the following \textbf{flex} scanner specification:
\begin{quote}
\begin{lstlisting}
%%
do                      { return T_Do; }
[A-Za-z_][A-Za-z0-9_]*  { return T_Identifier; }
\end{lstlisting}
\end{quote}
and we see the input string ``\texttt{dot}'', we will match the second rule and emit T\_Identifier for the whole string, not T\_Do.

However, it is possible to have a set of regular expressions for which we can tokenize a particular string, but for which taking the largest possible match will fail to break the input into tokens. Give an example of no more than two regular expressions and an input string such that: a) the string can be broken into substrings, where each substring matches one of the regular expressions, b) our usual lexer algorithm, taking the largest match at every step, will fail to break the string in a way in which each piece matches one of the regular expressions. Explain how the string can be tokenized and why taking the largest match won't work in this case.

As a challenge (not necessary for credit), try to find a solution that only uses one regular expression.

\textbf{Solution}: 

\end{enumerate}
\end{document}

