How do I overlay elements and solve the line-drawing problem in this TikZ example?

1) Draw the connecting lines outside the node.

2) You can use layers to put the cluster node in the foreground layer, while all the rest is in the main layer.

\documentclass[a4paper, 11pt]{article}

% Load packages
\usepackage{tikz}
\usepackage[sc,osf]{mathpazo}

% Load TikZ libraries
\usetikzlibrary{shapes,arrows}
\usetikzlibrary{fit}
\usetikzlibrary{backgrounds}
\usetikzlibrary{positioning}
\usetikzlibrary{calc}

% Grey palette used by the figure styles.
% Greys given explicitly as RGB triples:
\definecolor{lightGrey}{rgb}{0.9,0.9,0.9}
\definecolor{shade2}{rgb}{0.8,0.8,0.8}
\definecolor{shade3}{rgb}{0.65,0.65,0.65}
\definecolor{shade4}{rgb}{0.45,0.45,0.45}
% Shades that are aliases of an already-named colour
% (lightGrey must be defined before shade1 refers to it):
\definecolor{shade0}{named}{white}
\definecolor{shade1}{named}{lightGrey}

% Lengths shared by the figure elements: allocate both registers, then assign.
\newlength{\gpgpuElemSize}
\newlength{\gpgpuElemSep}
% Base element size; the node styles use multiples of it for minimum sizes.
\setlength{\gpgpuElemSize}{8mm}
% Base separation; used for node distances and shifts between elements.
\setlength{\gpgpuElemSep}{1mm}

% Font size switch issued once before the figure is drawn.
\newcommand{\figureTextSize}{\tiny}

% TikZ styles
% The arrow tip and the line width are kept in macros so that every style
% that uses them (box, connection, zoom-line, and the `>=` setting of the
% main picture) can be changed in one place.
\newcommand{\arrowStyle}{stealth}
\newcommand{\lineThickness}{semithick}
% `box`: a drawn rectangle with the common line width; base style of all
% boxes in the figure. Defined with \tikzset, since \tikzstyle is deprecated.
\tikzset{%
  box/.style={%
    draw,
    rectangle,
    \lineThickness,
  },
}

\begin{document}
% Declare two extra layers around the default `main` layer and fix their
% stacking order (listed back to front). The detailed cluster node is later
% put on `foreground` via a pgfonlayer environment.
\pgfdeclarelayer{background}
\pgfdeclarelayer{foreground}
\pgfsetlayers{background,main,foreground}

\begingroup
\figureTextSize
% \drawSM: draws the interior of one SM as a self-contained (nested)
% tikzpicture: a grid of 4 rows x 2 columns of SP nodes, two SFU nodes
% stacked below the grid, and, to the right of the grid, a column with
% Fetch/Dispatch, Registers, Inst. cache, Constant cache and Shared memory.
% The node styles used here (sp, sfu, issuer, register, cache, memory) are
% not defined in this macro; they come from the options of the enclosing
% tikzpicture in which \drawSM is used.
\newcommand{\drawSM}{%
  \begin{tikzpicture}[remember picture]
    % SPs
    % Nodes are named sp<row>-<column>. The first node is placed freely;
    % every other node is positioned relative to an already placed one.
    \foreach \y in {1, ..., 4} {%
      \foreach \x in {1, ..., 2} {%
        \ifnum \x=1
          \ifnum \y=1
            \node [sp] (sp\y-\x) {SP};
          \else
            % First column, later rows: below the node of the previous row.
            \pgfmathtruncatemacro\prevY{\y-1}
            \node [sp, below=of sp\prevY-\x] (sp\y-\x) {SP};
          \fi
        \else
          % Later columns: right of the node in the previous column.
          \pgfmathtruncatemacro\prevX{\x-1}
          \node [sp, right=of sp\y-\prevX] (sp\y-\x) {SP};
        \fi
      }
    }

    % SFUs
    % Both SFU nodes get a minimum width equal to the horizontal extent of
    % the two SP columns (from sp1-1's west side to sp1-2's east side),
    % reduced by \pgflinewidth (presumably to compensate for the drawn
    % border -- TODO confirm).
    \foreach \i in {1, ..., 2} {%
      \ifnum \i=1
        % First SFU: anchored at the south-west corner of the last SP row,
        % shifted down by half the element separation.
        \path let \p1 = (sp1-1.south west),
                  \p2 = (sp1-2.north east)
               in
                 node [%
                   sfu,
                   minimum width=\x2-\x1-\pgflinewidth,
                   below right,
                 ] at ([%
                   yshift={-0.5\gpgpuElemSep},
                 ] sp4-1.south west) (sfu\i) {SFU};
      \else
          \pgfmathtruncatemacro\prevI{\i-1}
        % Second SFU: same width, placed below the previous SFU.
        \path let \p1 = (sp1-1.south west),
                  \p2 = (sp1-2.north east)
               in
                 node [%
                   sfu,
                   minimum width=\x2-\x1-\pgflinewidth,
                   below=of sfu\prevI,
                 ] (sfu\i) {SFU};
      \fi
    }

    % Fetch/Dispatch
    % Starts the right-hand column, next to the top-right SP.
    \node [issuer, right=of sp1-2] (issuer) {Fetch/Dispatch};

    % Register file
    \node [register, below=of issuer] (register) {Registers};

    % Instruction cache
    \node [cache, below=of register] (icache) {Inst. cache};

    % Constant cache
    \node [cache, below=of icache] (ccache) {Constant cache};

    % Shared memory
    % Minimum width is taken from the constant cache node, minimum height
    % from the vertical extent of the two SFU nodes (bottom of sfu2 to top
    % of sfu1), each reduced by \pgflinewidth.
    \path let \p1 = (sfu2.south west),
              \p2 = (sfu1.north east),
              \p3 = (ccache.south west),
              \p4 = (ccache.north east)
           in
             node [%
               memory,
               minimum width=\x4-\x3-\pgflinewidth,
               minimum height=\y2-\y1-\pgflinewidth,
               below=of ccache,
             ] (sharedmem) {%
               \begin{tabular}{c}
                 Shared \\
                 memory
               \end{tabular}
             };
  \end{tikzpicture}
}
% Main figure. Left: a zoomed-out GPGPU box containing a grid of 3 rows x 4
% columns of cluster boxes, with five DRAM nodes below it. Right: a detailed
% view of one cluster (two SMs drawn with \drawSM plus a texture memory
% node). Dashed zoom lines connect the corners of grid cell gp2-4 to the
% corners of the detailed cluster.
% All styles used by the nested pictures (and by \drawSM) are defined in the
% option list below.
\begin{tikzpicture}[%
    auto,
    >=\arrowStyle,
    remember picture,
    % Default spacing used by the positioning keys (below=of, right=of, ...).
    every node/.style={%
      node distance=0.5\gpgpuElemSep,
    },
    % Two-part rectangle split node without a line between the parts.
    splitted/.style={%
      rectangle split,
      rectangle split parts=2,
      rectangle split draw splits=false,
    },
    % Outer GPGPU box: title in part one, cluster grid in part two.
    gpgpu/.style={%
      box,
      fill=shade2,
      splitted,
    },
    % One (empty) cluster cell in the zoomed-out grid.
    gpgpu-part/.style={%
      box,
      fill=shade3,
      minimum size=10mm,
    },
    % DRAM box; overrides the default node distance with a larger one.
    dram/.style={%
      box,
      fill=shade1,
      minimum width=8mm,
      minimum height=5mm,
      node distance=\gpgpuElemSep,
    },
    % Double-headed arrow used between the DRAMs and the GPGPU.
    connection/.style={%
      <->,
      \lineThickness,
    },
    % Detailed cluster box: title in part one, content in part two.
    cluster/.style={%
      box,
      fill=shade3,
      splitted,
    },
    % SM box: same as cluster, but with the fill overridden by shade0.
    sm/.style={%
      cluster,
      fill=shade0,
    },
    % Small square box for an SP; text fills the box (no inner padding).
    sp/.style={%
      box,
      fill=shade1,
      minimum size=0.5\gpgpuElemSize,
      inner sep=0pt,
    },
    % SFU boxes currently look exactly like SP boxes.
    sfu/.style={%
      sp,
    },
    % Wide, flat box; also the base of cache, memory and issuer.
    register/.style={%
      box,
      fill=shade2,
      minimum height=0.5\gpgpuElemSize,
      minimum width=2\gpgpuElemSize,
      inner sep=0pt,
    },
    cache/.style={%
      register,
    },
    memory/.style={%
      register,
    },
    % Fetch/Dispatch box: register with the darker shade3 fill.
    issuer/.style={%
      register,
      fill=shade3,
    },
    % Dashed lines between the zoomed-out cell and the detailed cluster.
    zoom-line/.style={%
      \lineThickness,
      dashed,
    },
  ]
  % Wrapper node holding the whole zoomed-out part (GPGPU + DRAMs), so the
  % detailed cluster can be positioned relative to it as one unit.
  \node (all-parts) {%
    \begin{tikzpicture}[remember picture]
      % GPGPU - zoomed out
      \node [gpgpu] (gpgpu) {%
        GPGPU
        \nodepart{two}
        \begin{tikzpicture}[remember picture]
          % Clusters
          % Cells are named gp<row>-<column> and placed like the SP grid in
          % \drawSM: first cell freely, first column below the previous row,
          % other columns to the right of the previous column.
          \foreach \y in {1, ..., 3} {%
            \foreach \x in {1, ..., 4} {%
              \ifnum \x=1
                \ifnum \y=1
                  \node [gpgpu-part] (gp\y-\x) {};
                \else
                  \pgfmathtruncatemacro\prevY{\y-1}
                  \node [gpgpu-part, below=of gp\prevY-\x] (gp\y-\x) {};
                \fi
              \else
                \pgfmathtruncatemacro\prevX{\x-1}
                \node [gpgpu-part, right=of gp\y-\prevX] (gp\y-\x) {};
              \fi
            }
          }
        \end{tikzpicture}
      };

      % DRAMs
      % A row of five DRAM nodes (dram1..dram5) in their own nested picture,
      % placed below the GPGPU box with an extra downward shift.
      \node [below=of gpgpu, yshift=-2\gpgpuElemSep] {%
        \begin{tikzpicture}[remember picture]
          \foreach \i in {1, ..., 5} {%
            \ifnum \i=1
              \node [dram] (dram\i) {DRAM};
            \else
              \pgfmathtruncatemacro\prevI{\i-1}
              \node [dram, right=of dram\prevI] (dram\i) {DRAM};
            \fi
          }

        \end{tikzpicture}
      };
    \end{tikzpicture}
  };

  % Connect DRAM with GPGPU
  % Drawn here in the outer picture, outside the nodes that contain the
  % nested pictures: a vertical arrow from each DRAM up to the point on the
  % GPGPU's south edge directly above it.
  \foreach \i in {1, ..., 5} %
    \draw [connection] (dram\i) -- (dram\i |- gpgpu.south);

  % Cluster - in detail
  % Put on the foreground layer so that it is painted on top of everything
  % on the main layer, including the zoom lines drawn further below.
 \begin{pgfonlayer}{foreground}
  \node [cluster, right=of all-parts, xshift=4\gpgpuElemSep] (cluster-big) {%
    Cluster
    \nodepart{two}
    \begin{tikzpicture}[remember picture]
      % SMs
      % Two SM boxes side by side, each filled in by \drawSM.
      \node [sm] (sm1)               {SM \nodepart{two}\drawSM};
      \node [sm, right=of sm1] (sm2) {SM \nodepart{two}\drawSM};
      % Texture memory
      % Spans the width of both SMs (sm1's west side to sm2's east side,
      % minus \pgflinewidth) and sits just below sm1's south-west corner.
      \path let \p1 = (sm1.south west),
                \p2 = (sm2.north east)
             in
               node [%
                 memory,
                 minimum width=\x2-\x1-\pgflinewidth,
                 below right,
               ] at ([%
                 yshift={-0.5\gpgpuElemSep},
               ] sm1.south west) (texturemem) {Texture memory};
    \end{tikzpicture}
  };
  \end{pgfonlayer} 

  % Zoom lines
  % One dashed line per corner, from grid cell gp2-4 to the matching corner
  % of the detailed cluster. These are on the main layer, i.e. below the
  % foreground-layer cluster node.
  \draw [zoom-line] (gp2-4.north west) -- (cluster-big.north west);
  \draw [zoom-line] (gp2-4.north east) -- (cluster-big.north east);
  \draw [zoom-line] (gp2-4.south west) -- (cluster-big.south west);
  \draw [zoom-line] (gp2-4.south east) -- (cluster-big.south east);

\end{tikzpicture}
\endgroup

\end{document}

enter image description here


Remember that TeX parses your .tex document from top to bottom, one line at a time. As such, the sequence in which you provide the commands in your tikzpicture determines the order in which the elements of the drawing are eventually typeset. To that extent, in answer to your second question (and without correcting the provided code), you need to move the Zoom lines to before the Cluster (between GPGPU and Cluster). This may require you to recode some (all?) of the Cluster or use a different technique to obtain the cluster-big.<location> nodes.

Another, less elegant way to do this is to typeset the entire Cluster twice, the second time after you've drawn the Zoom lines.