ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image...
Transcript of ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image...
![Page 1: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/1.jpg)
ECE 532 – Final Design Report:
Real-time High Dynamic Range Video Processing
David Dai
Calvin Thomason Ngan
Tao Ai
April 9th, 2012
![Page 2: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/2.jpg)
Table of Contents 1 – Overview ..............................................................................................................................................................3
1.1 Motivation ........................................................................................................................................................3
1.2 Goal ..................................................................................................................................................................3
1.3 Function Requirements .....................................................................................................................................3
1.4 Features ............................................................................................................................................................3
1.5 Equipment ........................................................................................................................................................3
1.6 Block Diagram..................................................................................................................................................4
1.7 Brief Description of IP .....................................................................................................................................7
2 - Outcome ................................................................................................................................................................9
2.1 Outcome Overview ...........................................................................................................................................9
2.2 Possible Further Improvements ......................................................................................................................10
3 - Project Schedule ..................................................................................................................................................10
4 - Description of the Blocks ....................................................................................................................................11
4.1 dvi_decoder ....................................................................................................................................................11
4.2 rgb_tdp_ram_32x2048 ...................................................................................................................................12
4.3 Memory Write Controller ...............................................................................................................................12
4.4 Memory Controller Block ..............................................................................................................................12
4.5 Memory Read Controller ................................................................................................................................12
4.6 rgb_lut_no_li ..................................................................................................................................................13
4.7 HDR Composite Block: composite_top .........................................................................................................13
4.8 Window Provision Block: framex_line_buffers .............................................................................................14
4.9 Convolution Block: conv5x5 ..........................................................................................................................15
4.10 dvi_encoder ..................................................................................................................................................16
Project Tree: .............................................................................................................................................................16
Reference ..................................................................................................................................................................18
Appendix A ..............................................................................................................................................................19
Figure 1 .................................................................................................................................................................19
Figure 2 .................................................................................................................................................................19
Figure 3 .................................................................................................................................................................20
Figure 4 .................................................................................................................................................................20
Figure 5 .................................................................................................................................................................21
Figure 6 .................................................................................................................................................................21
Appendix B ...............................................................................................................................................................22
![Page 3: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/3.jpg)
1 – Overview
1.1 Motivation Human eyes have limited dynamic range, meaning people are not able to perceive everything in an
environment at any light setting. For example, humans have trouble seeing the filament of a bright light bulb.
At the same time, image capturing devices also have limited dynamic range. For example, suppose a photo of a
person with scenery and the sun behind him is taken. In an underexposed image the person appears very
dark while the scenery appears normal. On the other hand, in an overexposed image the scenery appears
very white while the person seems normal. Therefore the team attempts to develop a design to create a
high dynamic range video with a camera which produces clear and detailed video regardless of the
lighting setting. This product will improve people’s vision. For example, one could look through the
capturing device at a welding process.
1.2 Goal
The goal of the design is to create high dynamic range (HDR) video in real-time using a FPGA. The
FPGA receives HDMI video stream from a camera, processes the video and outputs HDMI video to a
monitor. The processing includes constructing a HDR video frame by combining multiple frames of
different exposures from a camera, then post-processing resulting frame to enhance its quality. The post
processing includes a Gaussian convolution for blurring, edge detection and tonal mapping.
1.3 Function Requirements
The following are the functional requirements of the project.
Process alternating frames with distinct exposures in real time to generate a single image with
higher dynamic range.
Extensive image processing through convolution, edge detection and tone mapping for higher
picture quality
Input at 120 FPS at 720x480 resolution and output at 60 FPS in real time
Simultaneous writing (1 port) and reading (2 ports) from DDR2 Memory.
Image processing done in pipeline rather than parallel. HDR image is fed to processing
afterwards
1.4 Features
The following are the features of the design
Input at 120 FPS at 720x480 resolution and output at 60 FPS in real time
Image shown has higher dynamic range detail than before.
Video is enhanced through several image processing and filtering techniques
1.5 Equipment
The following are equipment used for the design and their properties.
1. Atlys [1]
![Page 4: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/4.jpg)
FPGA family: Spartan-6
FPGA part: xc6slx45-csg325
Speed grade: -2
2. Camera
Has HDMI output
Firmware is hacked to provide frames with alternating exposure
Provides 480p (720x480) video at pixel clock of 27MHz.
3. Monitor
Has DVI or HDMI input port
Its EDID must be recognisable by the camera after passing through the FPGA
1.6 Block Diagram
The following diagram provides the data path of the designs.
Board
FPGA
dvi_decoder
HDMI decoder
CameraCamera
MonitorMonitor
rgb_tdp_ram_32
x2048
Circular line
buffers (BRAM)
Memory write
controller
mcb
Memory
Controller
Block for
DDR2
rgb_tdp_ram_32
x2048
Circular line
buffers (BRAM)
rgb_tdp_ram_32
x2048
Circular line
buffers (BRAM)
dvi_encoder
HDMI encoder
framex_line_buffers
rgb_post_ram_48x512
Line buffer 1 (BRAM)
rgb_post_ram_48x512
Line buffer 3 (BRAM)
rgb_post_ram_48x512
Line buffer 2 (BRAM)
rgb_post_ram_48x512
Line buffer 4 (BRAM)
rgb_post_ram_48x512
Line buffer 5 (BRAM)
Memory read
controller
conv5x5
Gaussian
convolution
Tone mapping
Edge detection
DDR2 SDRAM
memory
HDMI IN
Input port
HDMI OUT
Output port
Switches
On board switches
Switch
Select output
Module name
description
Module name
Xilinx existing IP
Legend
External IP
Custom IPModule name
rgb_lut_no_li
rgb_r8stops_lut
Red look up table
(preloaded BRAM)
rgb_g8stops_lut
Green look up table
(preloaded BRAM)
rgb_b8stops_lut
Blue look up table
(preloaded BRAM)
Figure 1: Final Datapath containing original implementation with Post Processing
![Page 5: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/5.jpg)
Board
FPGA
dvi_decoder
HDMI decoder
CameraCamera
MonitorMonitor
rgb_tdp_ram_32
x2048
Circular line
buffers (BRAM)
Memory write
controller
mcb
Memory
Controller
Block for
DDR2
rgb_tdp_ram_32
x2048
Circular line
buffers (BRAM)
rgb_tdp_ram_32
x2048
Circular line
buffers (BRAM)
dvi_encoder
HDMI encoder
rgb_lut_no_li
rgb_r8stops_lut
Red look up table
(preloaded BRAM)
rgb_g8stops_lut
Green look up table
(preloaded BRAM)
rgb_b8stops_lut
Blue look up table
(preloaded BRAM)
Memory read
controller
DDR2 SDRAM
memory
HDMI IN
Input port
HDMI OUT
Output port
Module name
description
Module name
Xilinx existing IP
Legend
External IP
Custom IPModule name
Figure 2: Original implementation with only original HDR processing
![Page 6: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/6.jpg)
Board
FPGA
dvi_decoder
HDMI decoder
CameraCamera
MonitorMonitor
rgb_tdp_ram_32
x2048
Circular line
buffers (BRAM)
Memory write
controller
mcb
Memory
Controller
Block for
DDR2
rgb_tdp_ram_32
x2048
Circular line
buffers (BRAM)
rgb_tdp_ram_32
x2048
Circular line
buffers (BRAM)
dvi_encoder
HDMI encoder
framex_line_buffers
rgb_post_ram_48x720
Line buffer 1 (BRAM)
rgb_post_ram_48x720
Line buffer 3 (BRAM)
rgb_post_ram_48x720
Line buffer 2 (BRAM)
rgb_post_ram_48x720
Line buffer 4 (BRAM)
rgb_post_ram_48x720
Line buffer 5 (BRAM)
Memory read
controller
composite_topconv5x5
Gaussian
convolution
Tone mapping
Edge detection
DDR2 SDRAM
memory
HDMI IN
Input port
HDMI OUT
Output port
Switches
On board switches
Switch
Select output
lut_rw_lsh27
Red look up table
(BRAM)
lut_gw_lsh27
Green look up
table (BRAM)
lut_w_lsh17
Weight look up
table (BRAM)
lut_bw_lsh27
Blue look up table
(BRAM)
Module name
description
Module name
Xilinx existing IP
Legend
External IP
Custom IPModule name
divider_core
Figure 3: Data path containing new HDR composition(Not working) and post processing
![Page 7: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/7.jpg)
1.7 Brief Description of IP
On chip (FPGA)
Module name Origin Description
dvi_decoder Xilinx Turns HDMI input video into 8-bit red, green and
blue values and provides pixel clock, data valid,
HSYNC and VSYNC.
rgb_tdp_ram_32x2048 Xilinx BRAM with 32-bit wide and 2048 deep used as
circular line buffers
Memory Write Controller External
(grad student)
Controls the write enable and write address to store
incoming lines of a frame into a circular buffer. It
also controls the address for reading out of the
buffer as well as command instructions, burst
length, address and write enable for MCB to store
pixels into DDR2.
MCB Xilinx,
modified
Memory Controller Block for DDR2. MCB is
configured to have 6 32-bit ports. Only one port is
used for write and two are used for read. The PLL
inside MCB is modified to provide a 625MHz
clock.
Memory Read Controller Custom Controls the command instructions, burst length,
address and read enable for 2 ports of MCB to read
out pixels of two video frames from DDR2 at the
same time. It also controls the write enable and write
address to store these pixels into 2 separate circular
line buffers (BRAM).
rgb_lut_no_li External
(grad student)
Concatenates the pixel values of 2 frames to
produce an address for the look up table (BRAM).
This is done separately for red, green and blue
channel.
Sub-modules rgb_r8stops_lut Xilinx BRAM is preloaded with a look up table to
combine two frames into one for the red channel of
pixels
rgb_g8stops_lut BRAM is preloaded with a look up table to
combine two frames into one for the green channel
of pixels
rgb_b8stops_lut BRAM is preloaded with a look up table to
combine two frames into one for the blue channel
of pixels
composite_top Custom Combines a pixel from 2 different frames using a
response function and weighted function to produce
a HDR pixel
Sub-modules lut_rw_lsh27 Xilinx BRAM is preloaded with a look up table to output
the left shifted product of the camera response
function and certainty function for red pixel channel
lut_gw_lsh27 BRAM is preloaded with a look up table to output
the left shifted product of the camera response
![Page 8: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/8.jpg)
function and certainty function for green pixel
channel
lut_bw_lsh27 BRAM is preloaded with a look up table to output
the left shifted product of the camera response
function and certainty function for blue pixel
channel
lut_w_lsh17 BRAM is preloaded with a look up table to output
the left shifted certainty function for red, green and
blue channel of a pixel
divider_core External
(opencores)
A divider IP core from opencores.org is used
instead of the Xilinx IP because of its simplicity
and
framex_line_buffers Custom Control for 5 line buffers (BRAM) that store 5
different lines of a frame at any given time.
It also constructs a 5x5 window (matrix) of pixels
for convolution. The window shifts at every
positive clock edge.
Sub-modules rgb_post_ram_4
8x720
BRAM A 48-bit wide and 720 deep BRAM to store a line
of a frame. There are five of them to store 5 lines.
conv5x5 Custom Module that contains Gaussian convolution, edge
detection and tone mapping.
Sub-modules Gaussian
convolution
Custom Performs a 2D Gaussian convolution using the 5x5
window provided by framex_line_buffers to
produce one pixel. A new pixel is generated at
every positive clock edge.
Edge detection External
(grad student)
Emphasizes the edges of a frame by subtracting
convolved pixels from the original pixels.
Tone mapping External
(grad student)
A calculation that converts high dynamic range
pixels into low dynamic range pixels while keeping
their contrasts
dvi_encoder Xilinx Takes 8-bit red, green and blue values and output a
HDMI stream to the board.
On board
DDR2 SDRAM Xilinx Micron MT47H64M16-25E providing 16-bit bus
and 64M location. Operates up to an 800MHz data
rate. [1]
switches Xilinx Switch [7:0] selects different output: HDR image,
blurred image, edge enhanced image, or tone
mapped image.
HDMI IN Xilinx Input port for HDMI
HDMI OUT Xilinx Output port for HDMI
External
Camera Borrowed Camera is used to provide 480p HDMI input to
FPGA at 120fps
Monitor Borrowed Monitor with DVI or HDMI input ports
![Page 9: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/9.jpg)
2 - Outcome
2.1 Outcome Overview
We were able to implement all of our desired features and components and go above and beyond our
original implementation. Due to the fact that we had limited information about the topic at hand, the
initial proposal was somewhat vague and ambiguous. However the resulting project shows solid features
with careful design.
Original
Functional
Requirements
Final Functional
Requirements
Justification and Description
Process two
images of different
exposures
Process alternating frames
with distinct exposures in
real time to generate a
single image with higher
dynamic range.
At the early stages of design, the specs for the
cameras we used were not finalized. Despite using a
low power FPGA and limited resources, an improved
feature was achieved through efficient
implementation.
Filter Image for
better picture
Extensive image processing
through convolution, edge
detection and tone mapping
for higher picture quality
Limited in logic and DSP slices, multiplications done
through shifts for approx results and lowers resource
usage. Again feature has dramatically improved from
the beginning.
Able to Input and
Output Video
through HDMI
Input at 120 FPS at
720x480 resolution and 60
FPS in real time
HDMI has a handshaking mechanism known as
EDID, we pass this information along from the
display into the camera so that the camera transmits
data correctly.
Able to write and
read image
through DDR2
Simultaneous writing (1
port) and reading (2 ports)
from DDR2 Memory.
Instantiated IP provides arbitration to support
“simultaneous” read and writes. Achieved high
bandwidth through crossing clock domains from
Pixel Clock to System Clock
Synchronize video
after processing
Image processing done in
pipeline rather than
parallel. HDR image is fed
to processing afterwards
With proper pipelining, image processing and filter
should only add delay to the final output with barely
noticeable delays.
Original Feature Completion Status
Take in Video and Output Video Done: Input at 120 FPS at 720x480 resolution and 60 FPS in
![Page 10: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/10.jpg)
real time
Display Video with higher dynamic
range
Done: Image shown has higher dynamic range detail than
before.
Clearer video with more information and
detail
Done: Video is enhanced through several image processing
and filtering techniques
Acceptance Criteria Acceptance Status
Reasonable Video Quality Improvement Done: Video clearly shows edges and greater detail in real
time.
See both darker and bright regions with
better detail
Done: Output HDR video shows more detail than original
video.
2.2 Possible Further Improvements
Due to the limitations in time we were only able to combine two alternating frames to produce a single
frame. Looking ahead, there could be an implementation of HDR where it combines three or four frames
in order to produce an output video with an even higher dynamic range. There should be no bandwidth
limitations with the current MCB setup at (720 x 480) resolution with one input and four output ports.
The only issue is how the frames are combined to create the HDR frames which becomes much more
sophisticated when combining more than two.
Another possible improvement could be supporting High Definition Video (1280 x 720) resolution.
Currently, bandwidth is limited to only allow one input and three outputs at that resolution with the
memory running at 800MHz.
3 - Project Schedule
Milestones Proposed Goals
Actual Tasks
Involved Comment
0 Direct HDMI Video
Streaming
Learning the tools Spartan 6 uses ISE13.4.
Building the basic data
path for streaming TMDS Video Interface cores were used.
1
HDMI Video
Streaming through
DDR2
Writing streaming
controller for MCB 32 bursts per write/read was implemented.
Simulating the
controller
Consistent data output was seen.
There were no overflow/underflow issues.
![Page 11: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/11.jpg)
Testing on the
hardware
Video was corrupted.
MCB internal FIFOs did not properly cross
over clock domains.
2 Clock Domain-
Crossing Handling
Adding domain-
crossing FIFOs for
MCB Read/Write Added 1 wrt FIFO and 2 rd FIFOs.
Designing a more
robust domain-crossing
mechanism
Unstable timing variation propagates through
the FIFO when DDR2 refreshes. This caused
the FIFO to overflow.
Implementing domain-
crossing with circular
buffers
This produced stable output since timing
variation could no longer affect the arrival
time of the data afterward
3
Simple HDR
Processing with
BRAM LUT
Implementing
256*8bits BRAM LUT
for 2 frames
Results were pre-computed and placed in the
mif files
4 Implementation of
HDR Composition
Generating 256*16bits
ROM LUTs for
Camera Response
The latest implementation combined the
product of the Camera Response and
Certainty Function to avoid multiplication.
Generating 256*16bits
ROM LUTs for
Certainty Function
Left shift operated on the raw data to preserve
the precision for the fixed-point arithmetic.
Implementing the
arithmetic
Special cases for saturation were handled by
multiplexers.
5 Implementation of
Window Provision
Designing past
frame/window
retrieving mechanism
The original approach of storing the 16-bit
frames into memory was impossible because
of the limited bandwidth.
Implementing 5x5 past
window provision with
5 line buffers
The window keeps shifting within the 5 line
buffer, which is always ahead of the window
position. This allows us to keep the original
frame size to be 720p.
6
Implementation of
Convolution and Edge
Detection
Taking the output from
the 5x5 from the frame
and convolve
The convolution is done through a 2D
Gaussian filter to blur the picture.
Edge Detection
Taking the difference between the convoluted
results and adding to the original image will
create edge detection.
4 - Description of the Blocks This section describes the custom IP cores in more detail. Some other IP cores are also explained to
justify their usage and configurations.
4.1 dvi_decoder
![Page 12: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/12.jpg)
The dvi_decoder module is obtained from the reference code in xapp495 [2]. It turns HDMI input video into 8-
bit red, green and blue values and provides the red, green and blue values of a pixel at each positive edge of the pixel clock.
It also provides data valid signal, HSYNC and VSYNC.
4.2 rgb_tdp_ram_32x2048
rgb_tdp_ram_32x2048 is a True Dual Port BRAM generated using Block Memory Generator, version
6.3, in Xilinx CORE Generator. It is configured to be 32-bit wide and 2048 deep and is used as circular
line buffers. Each line of a frame from dvi_decoder is buffered before storing into MCB (for DDR2).
The circular line buffer can store 2 lines. It is 32 bits wide because each pixel contains 8-bit values of red,
green and blue (8 bits * 3 = 24 bits, which fits in one 32-bit word). The circular line buffer is 2048 deep to store 2 lines, each with 720
pixels.
4.3 Memory Write Controller
The memory write controller is logic borrowed from a grad student and is not modified. The logic controls
the write enable and write address to store incoming lines of a frame into a circular buffer. It also
controls the address for reading out of the buffer; command instructions, burst length, address and write
enable for MCB to store pixels into DDR2. The code for this module can be seen in Appendix B.
4.4 Memory Controller Block The Memory Controller Block (MCB) is generated using MIG Virtex-6 and Spartan-6, version 3.91, in Xilinx
CORE Generator. It is used to write and read data to and from the DDR2 SDRAM memory on the board. MCB in
Bank 3 of the board is chosen as it has fewer multi-purpose IO pins. Alternative option is Bank 1 which uses pins
conflicting with the ones used for HDMI in and out ports on the board. The memory address mapping is chosen to
be "bank, row, and column". The memory part on the board is MT47H64M16xx-25E which supports 16-bit bus
and up to 800MHz data rate.[1] A DDR2-625Mb/s data rate is chosen for the design. Therefore DDR2 runs at
312.5 MHz clock and MCB runs at 625MHz to interface with the memory.[3] However the system clock provided
by the board is limited at 100MHz. Consequently, the PLL in MCB is modified to receive 100MHz system clock,
multiplied by 25 and divided by 4 to output 625MHz. The user clock is 78.125MHz (100*25/32) which is higher
than pixel clock (27MHz). User clock is used for MCB to receive command and write/read data from external
logic. MCB is configured to have 6 32-bit ports. 1 port is used for writing incoming frames and 2 ports are used to
read 2 different frames at the same time. The memory bandwidth is 625MHz*16bits = 10000Mbit/s, while each port requires 27MHz*32bits = 864Mbit/s.
Therefore the number of usable ports is 10000/864 ≈ 11. Consequently all 6 ports can be used if the design is to be
expanded to use more than 2 differently exposed frames in the future.
4.5 Memory Read Controller
This block interfaces with and controls the data flowing between the MCB (Spartan-6 Memory Controller Block)
and the clock-domain-crossing BRAM (block RAM). The module ensures that the read FIFO inside the MCB neither
overflows with data brought from the memory nor underflows. The controllers
ensure that there is always available data within the BRAM so that each frame and each pixel can be
displayed properly and consistently.
The challenge in designing this control mechanism comes from the fact that MCB does not provide a
signal that indicates when a read command is processed completely. The delay from when a command is
![Page 13: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/13.jpg)
issued and when data is pushed into the MCB FIFO varies drastically and presents corner cases when
MCB performs a refresh on DDR2 memory. Here we discuss how we address these issues through the
use of circular line buffers and by taking advantage of horizontal and vertical blanking when outputting
the video.
The circular line buffer provides 1024 words per line (2048 words in total, as mentioned in the BRAM description), so that it can store a
line from the MCB while the DVI encoder reads the other line previously stored. The line buffer does
not run the risk of emptying out due to the fact that pixel clock runs at a lower frequency and there is an
extra time interval known as horizontal blanking where no pixel data is read, which gives us even more
time to complete filling a line from the MCB to BRAM.
The delay between when a command is issued and data being ready usually floats around 22 cycles. This
controller issues commands to read in bursts of 32 words per command. The rationale behind using 32
words burst depends on the fact that the MCB read FIFO is only 64 deep. The idea is to issue the
minimal amount of commands to attain the data to avoid extra processing delays. However, because we
cannot control the rate of data being pushed into the MCB read FIFO, using 64 word bursts is only
reliable if we issue the next command after the FIFO is emptied out to prevent overflows, leaving 22
cycles of idleness every time a command is issued. Using 32 word bursts allows two commands to be
processed at the same time without overflow, issuing the next command as soon as 32 words have been
read. This allows constant data to be streamed out of the MCB FIFO into the BRAM, creating a much
more efficient implementation without the possibility of overflow. Code snippet describing the logic can
be seen in Appendix B.
This block shares the same clock as the user clock provided by the MCB, which runs at 78.125MHz (the user clock frequency given in Section 4.4). This
module is instantiated twice to read from two different ports. An arbiter inside the MCB divides the time
slices evenly in order to process simultaneous reads and writes from multiple ports.
4.6 rgb_lut_no_li
rgb_lut_no_li is a look-up table module borrowed from a grad student and used unmodified. The module
concatenates the pixel values of 2 frames to produce an address for the look-up table. This is done separately
for the red, green and blue channels. The look-up tables are implemented as BRAMs: rgb_r8stops_lut,
rgb_g8stops_lut and rgb_b8stops_lut. The contents of these BRAMs are located at
rtl/rgb_lut/rgb_r8stops_lut.coe, rtl/rgb_lut/rgb_g8stops_lut.coe, and rtl/rgb_lut/rgb_b8stops_lut.coe (see the
Project Tree section).
4.7 HDR Composite Block: composite_top
HDR Image Composition is the first step of the HDR processing. The purpose of this step is to take two
differently exposed images as inputs and combine them into an HDR image. Moreover, although the
image is represented as 8-bit pixel values in three channels, the output is a combined 16-bit
![Page 14: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/14.jpg)
photoquantity value. We wish to use this format because it gives us more control over the processing.
The photoquantity is a tonal value output from the camera and is linear in the sensor response; thus, by
simple scalar multiplication, we can vary the exposure of the resulting image.
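Let’s define the function f to be the camera response function and the pixel values from the two
differently exposed images to be f1 and f2. The photoquantity, q, is estimated as the following weighted
average, with the certainty function w as the weight:
$$\hat{q} = \frac{w(f_1)\,f^{-1}(f_1) + w(f_2)\,f^{-1}(f_2)}{w(f_1) + w(f_2)}$$
(The equation in the original report was an image that did not survive text extraction. The form above is
reconstructed from the block descriptions below; the special cases it listed are not recoverable here.)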
The FPGA implementation of this process is called composite_top, and it is designed in a way that
preserves the precision and speed of the math operations. The calculation result is placed into a multiplexer
at the end to take care of the special cases mentioned in the above equation. Since the processes are the
same for all three channels in RGB, the block diagram of only the R channel is given below, with a description of
each block:
1. BRAM LUT “lut_rw_lsh27” takes in an 8-bit pixel value and gives the product of the inverse
camera response and the weighting (certainty) function corresponding to the pixel value. The product value is
left shifted by 27 bits to fill the 32-bit bus in order to preserve the precision.
2. BRAM LUT “lut_w_lsh17” takes in an 8-bit pixel value and gives the corresponding certainty
for a pixel value. It is left shifted 17 bit so that the max q result does not overflow the 16-bit bus
after the division.
3. Arith block is called “composite_arith_operator” in the code. It takes in the two products and the
two certainty values then outputs the quotient in the end according to the equation above.
Because of the shifting choice made in the LUT, the resulting q is a 10-bit right shifted value.
Further arithmetic operation such as convolution can be performed on this shifted version of q.
After each photoquantity has been calculated and properly muxed, the three channels are
normalized and combined into one channel called luminance L. In this way, we can save a huge amount
of the resources that were originally required by three independent RGB channels. The equation involved here
is:
The complete block diagram of composite_top is provided in Figure 6 in Appendix A.
4.8 Window Provision Block: framex_line_buffers
![Page 15: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/15.jpg)
The main functions of the framex_line_buffers module are to store 5 lines of a video frame and output a
5x5 window of pixels. Figure 3 in Appendix A provides an example of a 5x5 window within a frame. In
order to provide 25 pixels at every clock, 5 different lines of a frame must be available. Therefore five
48-bit wide, 720-deep BRAMs (the line buffers) are required. Each is 48 bits wide because each pixel consists of
16-bit red, green and blue channels (3*16=48). Each instantiation costs 3 BRAMs (18k bits), so the
framex_line_buffers module costs 15 BRAMs (3*5=15).
Figure 4 in Appendix A depicts the operation of the module. A new valid pixel (16 bits per channel) arrives at every
positive clock edge except during VSYNC and HSYNC. Write Control selects which BRAM to write to
in a circular fashion. At the beginning of a frame, the first line buffer is used until the first line (720
pixels) is stored or HSYNC triggers. Then Write Control uses the second line buffer to store the second
line. After storing the 5th line on the 5th line buffer, Write Control selects the first line buffer to store
the 6th line. The process repeats until VSYNC triggers, at which Write Control resets to use the first line
buffer to store first line of frame again. Write address of line buffer increments from 0 to 719 and resets
during HSYNC. While only one line buffer is written to at any given time, a pixel is read from all five
line buffers with the same read address. Read Control increments the read address, which is always one
less than the write address to avoid write/read conflicts. Each of the five pixels is flopped by a shift
register consisting of five flip-flops, so five shift registers are used, each associated with a line buffer. As
the read address of the line buffers increases at each clock, a new pixel goes into the first flip-flop and the pixel
value from six clock cycles ago in the 5th flip-flop gets replaced by the pixel from 5 clock cycles ago. The five
flip-flops in a shift register provide five different pixels of a line, and the 5 shift registers provide 25 pixels
(a 5x5 window). The window shifts to the right as the read address increases at every clock cycle;
meanwhile, the values in the window shift to the left. At every HSYNC, the window shifts downwards. To
produce such effect, Window Control chooses the shift registers to construct the window in a circular
fashion as well. The shift register associated to the line buffer that is being written to, is used for the
bottom row of the window. The shift register associated to the immediate previously written line buffer
is used for second bottom row and so on.
The line buffers used in this module are instantiations of rgb_post_ram_48x720, a True Dual Port
BRAM generated using Block Memory Generator, version 6.3, in Xilinx CORE Generator. It is
configured to be 48-bit wide and 720 deep. It is 48-bit wide because each red, green and blue channel of
pixel from composite_top is 16 bits (16bits*3 = 48). It is 720 deep because a line has 720 pixels.
4.9 Convolution Block: conv5x5
Convolution processes the window given to us by the 5 x 5 frame buffer. Our approach uses a 2-D
Gaussian blurring method to blur the video input. The distribution is shown in Figure 1 in Appendix A. However, it is very
costly to perform accurate multiplication and division: multipliers use an unreasonable amount of
resources and cause timing issues because of the delay they introduce. Instead, we approach convolution
with coefficients that are powers of 2 in order to dramatically reduce resource usage while providing a decent
approximation of the actual result.
![Page 16: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/16.jpg)
An example of this process can be seen in Figure 2 in Appendix A. In this 3x3 convolution
example, each element under the “kernel” is multiplied by the corresponding Gaussian weight, and the products are
summed together to produce the output for the middle pixel in order to blur the image.
4.10 dvi_encoder
The dvi_encoder module is obtained from the reference code in xapp495[2]. At every pixel clock, it takes 8-
bit red, green and blue values and outputs an HDMI stream to the board. The module also receives data
valid, HSYNC and VSYNC as inputs.
Project Tree:
Directory:
bit: Some generated bit files throughout the term
cores: All the IP (Xilinx/External) used in the design
doc: Group report and paper draft
ise: ISE folder
rtl: Verilog code of the design
sim: Testbench for simulation
Description for ./rtl:
common: House-keeping block such as debounce and pixel generator for
simulation.
composite: HDR composition, taking in multiple images and combining them into an HDR
image for further processing by rgb_proc. A spreadsheet for simulation is also included.
mcb: Memory control block from Xilinx
rgb_lut: BRAM LUT for simple HDR processing; takes in two images and outputs the
result.
rgb_proc: More complex HDR processing, contains composition, window provision,
convolution and edge detection.
rx: DVI decoder from Xilinx.
tx: DVI encoder from Xilinx.
![Page 17: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/17.jpg)
atlys.ucf: User constraint file.
atlys.v: Top-level Module used for both hardware test and simulation
atlys_top: Hardware implementation
atlys_top_tb: Simulation testbench
framex_line_buffers.v: Window provision implementation, used before convolution.
![Page 18: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/18.jpg)
Reference
[1] atlys rm, http://www.digilentinc.com/Data/Products/ATLYS/Atlys_rm.pdf
[2] xapp495
[3] http://www.xilinx.com/support/answers/43663.htm
![Page 19: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/19.jpg)
Appendix A
Figure 1
Diagram showing the layout of the window with the appropriate Gaussian distribution values.
Figure 2
Pictures showing how the input pixels are mapped into final output of the pixel, referenced from
http://www.songho.ca/dsp/convolution/convolution.htm
![Page 20: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/20.jpg)
Figure 3
Example of a 5x5 window in a video frame
5x5 window
A video frame
Figure 4
detailed data path of framex_line_buffers module
Write ControlPixel
rgb_post_ram_48x720
Line buffer 1 (BRAM)
rgb_post_ram_48x720
Line buffer 3 (BRAM)
rgb_post_ram_48x720
Line buffer 2 (BRAM)
rgb_post_ram_48x720
Line buffer 4 (BRAM)
rgb_post_ram_48x720
Line buffer 5 (BRAM)
48 bits
48 bits
48 bits
Window Control
Read Control
5x5 window
Shift register 1
D Q D Q D Q D Q D Q
Shift register 5
D Q D Q D Q D Q D Q
Shift register 4
D Q D Q D Q D Q D Q
Shift register 3
D Q D Q D Q D Q D Q
Shift register 2
D Q D Q D Q D Q D Q
48 bits
48 bits
![Page 21: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/21.jpg)
Figure 5
data path showing calculation done on a single color of a pixel.
Figure 6
Complete block diagram of composite_top is provided as follows, from 8-bit pixel to L:
![Page 22: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/22.jpg)
Appendix B
Code snippet showing the logic behind the MCB read controller:
//
// MCB read-command strobe shared by read ports p1 and p2.
//
// c3_cmd_en is held low while the MCB is in reset or calibration has not
// finished. Otherwise it is raised for a single cycle (it is always dropped
// on the cycle after it was raised) whenever ALL of the following hold:
//   - both ports have fewer than MAX_N_CMDS commands counted in c3_pN_cmd_cnt
//   - neither port's command FIFO is full
//   - bit BRAM_PAGE of both BRAM write addresses is clear
//     (presumably "the current line has not been completely fetched yet";
//      confirm against the p1/p2 address counters, which are not shown here)
always @(posedge c3_clk0) begin
    if ((c3_rst0 == 1'b1) || (c3_calib_done == 1'b0)) begin
        c3_cmd_en <= 1'b0;
    end else begin
        if (c3_cmd_en == 1'b1) begin
            // Strobe was high last cycle: force a one-cycle gap.
            c3_cmd_en <= 1'b0;
        end else if ((c3_p1_cmd_cnt < MAX_N_CMDS) &&   // not exceeded maximum allowable commands
                     (c3_p2_cmd_cnt < MAX_N_CMDS) &&
                     (c3_p1_cmd_full == 1'b0) &&       // command p1 fifo not full
                     (c3_p2_cmd_full == 1'b0) &&       // command p2 fifo not full
                     (p1_ram_addra[BRAM_PAGE] == 1'b0) &&
                     (p2_ram_addra[BRAM_PAGE] == 1'b0)) begin
            c3_cmd_en <= 1'b1;
        end else begin
            c3_cmd_en <= 1'b0;
        end
    end
end
Code for Memory Write Controller borrowed from a grad student:
// Word written into the P0 line buffer: 8 zero bits followed by the blue,
// green and red bytes of the received pixel (8 + 3*8 = 32 bits).
assign p0_ram_din = {8'b0, rx1_blue[7:0], rx1_green[7:0], rx1_red[7:0]};
// Full BRAM write address: the page bit (as seen in the rx1_pclk domain) is
// placed above the low BRAM_PAGE bits of the write pointer.
assign p0_ram_addra_full = { rx1_ram_page, p0_ram_addra[BRAM_PAGE-1:0]};
// Full BRAM read address: uses the inverted page bit above the low BRAM_PAGE
// bits of the read pointer.
// NOTE(review): presumably this makes the read side use the opposite page from the write side; confirm.
assign p0_ram_addrb_full = {!c3_ram_page, p0_ram_addrb[BRAM_PAGE-1:0]};
// P0 line-buffer write pointer, clocked by the receiver pixel clock.
//   rx1_reset      : load all ones; this forces BRAM writes to wait until
//                    the next hsync rising edge
//   rx1_hsync_rise : restart at word 0 for the new line
//   p0_ram_we      : advance by one word
always @(posedge rx1_pclk) begin
    if (rx1_reset == 1'b1)
        p0_ram_addra <= {11{1'b1}};
    else if (rx1_hsync_rise == 1'b1)
        p0_ram_addra <= 11'd0;
    else if (p0_ram_we == 1'b1)
        p0_ram_addra <= p0_ram_addra + 11'd1;
end

// Write only while data-enable is high and bit BRAM_PAGE of the pointer is clear.
assign p0_ram_we = rx1_de & !p0_ram_addra[BRAM_PAGE];
![Page 23: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/23.jpg)
// P0 line-buffer read pointer, clocked by the MCB user clock.
// Its most significant bit is used to control the MCB FIFO write enable.
//   reset / calibration pending : load all ones; this forces cmd_en to wait
//                                 until the next hsync rising edge
//   c3_hsync_rise               : restart at word 0
//   c3_p0_wr_en                 : advance by one word
always @(posedge c3_clk0) begin
    if (c3_rst0 == 1'b1 || c3_calib_done == 1'b0)
        p0_ram_addrb <= {11{1'b1}};
    else if (c3_hsync_rise == 1'b1)
        p0_ram_addrb <= 11'd0;
    else if (c3_p0_wr_en == 1'b1)
        p0_ram_addrb <= p0_ram_addrb + 11'd1;
end
// RAM address page select: cleared while in reset or before calibration
// completes, then inverted on every hsync rising edge.
always @(posedge c3_clk0) begin
    if (c3_rst0 == 1'b1 || c3_calib_done == 1'b0)
        c3_ram_page <= 1'b0;
    else if (c3_hsync_rise == 1'b1)
        c3_ram_page <= !c3_ram_page;
end
// Flop c3_ram_page into rx1_clk Domain
// c3_ram_page is registered on c3_clk0; here it is sampled on rx1_pclk through
// two registers in series (bit 0 first, then bit 1).
always @(posedge rx1_pclk) begin
rx1_ram_page_d[1:0] <= {rx1_ram_page_d[0],c3_ram_page};
end
// The rx1-side logic uses the second-stage register.
assign rx1_ram_page = rx1_ram_page_d[1];
// One-cycle-delayed copy of the p0 write-FIFO full flag. It resets to 1 so
// that no rising edge is detected on the first cycle after reset/calibration,
// even if c3_p0_wr_full is already high.
always @(posedge c3_clk0) begin
if ((c3_rst0 == 1'b1) || (c3_calib_done == 1'b0)) begin
c3_p0_wr_full_d <= 1'b1;
end else begin
c3_p0_wr_full_d <= c3_p0_wr_full;
end
end
// Command strobe: high only in the cycle where c3_p0_wr_full is 1 and was 0
// on the previous cycle (rising edge of the full flag).
assign c3_p0_cmd_en = c3_p0_wr_full & !c3_p0_wr_full_d;
// Instruction field is 3'b010 while the strobe is high, 3'b000 otherwise.
// NOTE(review): the meaning of 3'b010 is defined by the MCB command interface; confirm against the MCB user guide.
assign c3_p0_cmd_instr = (c3_p0_cmd_en) ? 3'b010 : 3'b000;
// Burst-length field is 6'b111111 (63) while the strobe is high.
// NOTE(review): presumably encodes a 64-word burst as length minus one; confirm.
assign c3_p0_cmd_bl = (c3_p0_cmd_en) ? 6'b111111 : 6'b000000; // TODO Check if necessary
// Write mask is all zeros.
assign c3_p0_wr_mask = 4'b0;
// Data pushed into the MCB write FIFO comes straight from the BRAM read port.
assign c3_p0_wr_data = p0_ram_dout;
// Count the amount of data written into the mcb p0 write fifo.
//
// c3_p0_wr_cnt is a down-counter: it is loaded with MCB_FIFO_SIZE when it is
// zero and the FIFO reports empty, then decrements to zero. c3_p0_wr_en is
// high for as long as the counter is non-zero.
// p0_ram_addrb is used in the reset term to prevent BRAM address overflow.
always @(posedge c3_clk0) begin
    if ((c3_rst0 == 1'b1) || (c3_calib_done == 1'b0) || (p0_ram_addrb[BRAM_PAGE] == 1'b1)) begin
        // NOTE: Make sure the counter resets at BURST SIZE boundaries!
        c3_p0_wr_cnt <= 7'b0;
    end else begin
        if (|c3_p0_wr_cnt == 1'b1) begin // Decrement counter to zero
![Page 24: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/24.jpg)
            c3_p0_wr_cnt <= c3_p0_wr_cnt - 7'b1;
        end else if (c3_p0_wr_empty == 1'b1) begin
            // Reset counter when mcb fifo is empty and it is known 64-words can fit
            // KO-TODO: This implementation is not optimal based on empty, try to pipeline
            // transactions, fix later.
            c3_p0_wr_cnt <= MCB_FIFO_SIZE;
        end
    end
end
// We expect to write EXACTLY 64 x 32-bit words, this will happen EXACTLY
// 16 times for a total of 1024 x 32-bit words per line.
assign c3_p0_wr_en = |c3_p0_wr_cnt;
// Two-stage delay line for c3_p0_wr_en, compensating for the BRAM dout delay:
// bit 0 is the enable one cycle ago, bit 1 the enable two cycles ago.
always @(posedge c3_clk0) begin
    if (c3_rst0 == 1'b1 || c3_calib_done == 1'b0)
        c3_p0_wr_en_d <= 2'b00;
    else
        c3_p0_wr_en_d <= {c3_p0_wr_en_d[0], c3_p0_wr_en};
end
//
// Generate Frame and Line Addresses
//
// c3_cmd_frame_addr   : incremented by one on every vsync rising edge
// c3_cmd_line_addr    : cleared on vsync rising edge, incremented on every hsync rising edge
// c3_cmd_line_addr_2x : like c3_cmd_line_addr but starts from LINE_SHIFT and wraps back
//                       to LINE_SHIFT once it exceeds (10'b100010000 + LINE_SHIFT)
reg [9:0] c3_cmd_line_addr_2x; //Reset every 240 (= 480/2) lines
localparam [9:0] LINE_SHIFT = 10'b1111000; //120
localparam [10:0] BYTE_SHIFT = 11'b10110100; //180
always @(posedge c3_clk0) begin
if ((c3_rst0 == 1'b1) || (c3_calib_done == 1'b0)) begin
c3_cmd_frame_addr <= 3'b0;
c3_cmd_line_addr <= 10'b0;
c3_cmd_line_addr_2x <= 10'b0;
end else begin
if (c3_vsync_rise == 1'b1) begin // beginning of vsync
c3_cmd_line_addr <= 10'b0;
c3_cmd_line_addr_2x <= LINE_SHIFT;
c3_cmd_frame_addr <= c3_cmd_frame_addr + 3'b1;
end else begin
// NOTE(review): 10'b100010000 is 272 decimal, while the comments here speak of 240 lines; confirm the intended threshold.
if (c3_cmd_line_addr_2x > (10'b100010000 + LINE_SHIFT)) begin
c3_cmd_line_addr_2x <= LINE_SHIFT; // every 240 lines, zero to read again
end
// This assignment comes later in the block, so when hsync rises in the same
// cycle as the wrap condition above, the increment is the one that takes effect.
if (c3_hsync_rise == 1'b1) begin // beginning of hsync
c3_cmd_line_addr <= c3_cmd_line_addr + 10'b1;
c3_cmd_line_addr_2x <= c3_cmd_line_addr_2x + 10'b1;
end
![Page 25: ECE 532 Final Design Report: Real-time High Dynamic Range ...pc/courses/432/2012... · Image processing done in pipeline rather than parallel. HDR image is fed to processing afterwards](https://reader034.fdocuments.us/reader034/viewer/2022050311/5f72c32f78fb2c3026215343/html5/thumbnails/25.jpg)
end
end
end
// Word (4-byte) address within the current line for port p0.
// Cleared in reset, before calibration completes, and at the beginning of
// hsync; otherwise incremented whenever the twice-delayed write enable is high.
always @(posedge c3_clk0) begin
    if (c3_rst0 == 1'b1 || c3_calib_done == 1'b0 || c3_hsync_rise == 1'b1)
        c3_p0_cmd_4byte_addr <= 11'd0;
    else if (c3_p0_wr_en_d[1] == 1'b1)
        c3_p0_cmd_4byte_addr <= c3_p0_cmd_4byte_addr + 11'd1;   // increment burst address
end
// 27-bit byte address covering the 128MB SDRAM, built as
//   {1'b0, frame[2:0], line[9:0], (word-in-line[10:0] - 64), 2'b00}
// The two trailing zero bits convert the 4-byte word address to a byte address.
// CALVIN - minus 64 instead of 63 because cmd_en isn't aligned with the clock edge and
// takes the next address instead.
assign c3_p0_cmd_byte_addr = {1'b0, c3_cmd_frame_addr[2:0],
                              c3_cmd_line_addr[9:0],
                              (c3_p0_cmd_4byte_addr[10:0]-11'b1000000), 2'b0}; // Convert to byte address
// Add 3 bits of zero padding above the 27-bit address; the DRAM part is only 128MB.
assign c3_p0_cmd_byte_addr_full = {3'b0,c3_p0_cmd_byte_addr};