Hi,
I have a question regarding the configuration parameters of Ruby.
I'm running raytrace, a threaded benchmark application from the ALPBench
benchmark suite, and I am puzzled by the difference between running
Simics in default mode (1 cycle per instruction) and running it with Ruby.
My L1 cache statistics show an almost perfect hit ratio in both the
instruction and data caches. Still, the execution time is several times
higher than when I run the same application in Simics without Ruby.
I expected the difference to be much smaller since the cache hit ratio
is so good.
Do you have a possible explanation for this strange result? Since I'm
not an expert on ruby, I can imagine that some of the parameters in the
configuration file might affect the overall result significantly. You
can find my configuration parameters below.
Regards,
Mladen
g_RANDOM_SEED: 1
g_DEADLOCK_THRESHOLD: 50000
g_FORWARDING_ENABLED: false
RANDOMIZATION: false
g_SYNTHETIC_DRIVER: false
g_DETERMINISTIC_DRIVER: false
// MOSI_CMP_token1 parameters
g_FILTERING_ENABLED: false
g_DISTRIBUTED_PERSISTENT_ENABLED: true
g_RETRY_THRESHOLD: 1
g_DYNAMIC_TIMEOUT_ENABLED: true
g_FIXED_TIMEOUT_LATENCY: 300
g_trace_warmup_length: 1000000
g_bash_bandwidth_adaptive_threshold: 0.75
g_tester_length: 0
// # of synthetic locks == 16 * 128
g_synthetic_locks: 2048
g_deterministic_addrs: 1
g_SpecifiedGenerator: DetermInvGenerator
g_callback_counter: 0
g_NUM_COMPLETIONS_BEFORE_PASS: 0
g_think_time: 5
g_hold_time: 5
g_wait_time: 5
PROTOCOL_DEBUG_TRACE: true
DEBUG_FILTER_STRING: none
DEBUG_VERBOSITY_STRING: none
DEBUG_START_TIME: 0
DEBUG_OUTPUT_FILENAME: none
SIMICS_RUBY_MULTIPLIER: 2
OPAL_RUBY_MULTIPLIER: 2
TRANSACTION_TRACE_ENABLED: false
USER_MODE_DATA_ONLY: false
PROFILE_HOT_LINES: false
PROFILE_ALL_INSTRUCTIONS: false
PRINT_INSTRUCTION_TRACE: false
BLOCK_STC: false
PERFECT_MEMORY_SYSTEM: false
PERFECT_MEMORY_SYSTEM_LATENCY: 0
DATA_BLOCK: false
REMOVE_SINGLE_CYCLE_DCACHE_FAST_PATH: false
// *********************************************
// CACHE & MEMORY PARAMETERS
// *********************************************
g_SIMICS: true
L1_CACHE_ASSOC: 4
L1_CACHE_NUM_SETS_BITS: 7
L2_CACHE_ASSOC: 4
L2_CACHE_NUM_SETS_BITS: 11
// 32 bits = 4 GB address space
g_MEMORY_SIZE_BYTES: 4294967296
g_DATA_BLOCK_BYTES: 64
g_PAGE_SIZE_BYTES: 4096
g_NUM_PROCESSORS: 0
g_NUM_L2_BANKS: 0
g_NUM_MEMORIES: 0
g_PROCS_PER_CHIP: 1
// The following group of parameters are calculated. They must
// _always_ be left at zero.
g_NUM_CHIPS: 0
g_NUM_CHIP_BITS: 0
g_MEMORY_SIZE_BITS: 0
g_DATA_BLOCK_BITS: 0
g_PAGE_SIZE_BITS: 0
g_NUM_PROCESSORS_BITS: 0
g_PROCS_PER_CHIP_BITS: 0
g_NUM_L2_BANKS_BITS: 0
g_NUM_L2_BANKS_PER_CHIP: 0
g_NUM_L2_BANKS_PER_CHIP_BITS: 0
g_NUM_MEMORIES_BITS: 0
g_NUM_MEMORIES_PER_CHIP: 0
g_MEMORY_MODULE_BITS: 0
g_MEMORY_MODULE_BLOCKS: 0
// determines whether the lowest bits of a block address
// are used to index to a L2 cache bank or into the sets of a
// single bank
//
//        lowest                                             highest
// true:  g_DATA_BLOCK_BITS | g_NUM_L2_BANKS_PER_CHIP_BITS | L2_CACHE_NUM_SETS_BITS
// false: g_DATA_BLOCK_BITS | L2_CACHE_NUM_SETS_BITS | g_NUM_L2_BANKS_PER_CHIP_BITS
MAP_L2BANKS_TO_LOWEST_BITS: true
// TIMING PARAMETERS
DIRECTORY_CACHE_LATENCY: 6
NULL_LATENCY: 1
ISSUE_LATENCY: 2
CACHE_RESPONSE_LATENCY: 12
L1_RESPONSE_LATENCY: 1
L2_RESPONSE_LATENCY: 6
COLLECTOR_REQUEST_LATENCY: 1
MEMORY_RESPONSE_LATENCY_MINUS_2: 78
DIRECTORY_LATENCY: 80
NETWORK_LINK_LATENCY: 40
COPY_HEAD_LATENCY: 4
ON_CHIP_LINK_LATENCY: 1
RECYCLE_LATENCY: 3
L2_RECYCLE_LATENCY: 5
TIMER_LATENCY: 10000
TBE_RESPONSE_LATENCY: 1
PERIODIC_TIMER_WAKEUPS: true
// constants used by TM protocols
RETRY_LATENCY: 100
RESTART_DELAY: 1000
PROFILE_EXCEPTIONS: false
PROFILE_XACT: false
XACT_NUM_CURRENT: 0 // must be 0
XACT_LAST_UPDATE: 0 // must be 0
// constants used by CMP protocols
// cache bank access times
L1_REQUEST_LATENCY: 2
L2_REQUEST_LATENCY: 4
// Allows only a single access to a multi-cycle L2 bank.
// Ensures the cache array is only accessed once for every
// L2_REQUEST_LATENCY number of cycles. However the TBE table can be
// accessed in parallel.
SINGLE_ACCESS_L2_BANKS: true
// Ruby cycles between when a sequencer issues a request and it arrives at
// the L1 cache controller
SEQUENCER_TO_CONTROLLER_LATENCY: 4
// Number of transitions each controller state machine can complete per
// cycle
// i.e. the number of ports to each controller
// L1cache is the sum of the L1I and L1D cache ports
L1CACHE_TRANSITIONS_PER_RUBY_CYCLE: 32
// Note: if SINGLE_ACCESS_L2_BANKS is enabled, this will probably enforce a
// much greater constraint on the concurrency of a L2 cache bank
L2CACHE_TRANSITIONS_PER_RUBY_CYCLE: 32
DIRECTORY_TRANSITIONS_PER_RUBY_CYCLE: 32
COLLECTOR_TRANSITIONS_PER_RUBY_CYCLE: 32
// Maximum number of requests (including SW prefetches) outstanding from
// the sequencer (Note: this also include items buffered in the store
// buffer)
g_SEQUENCER_OUTSTANDING_REQUESTS: 16
// Number of TBEs available for demand misses, ALL prefetches, and
// replacements
// used by one-level protocols
NUMBER_OF_TBES: 128
// two-level protocols
NUMBER_OF_L1_TBES: 32
NUMBER_OF_L2_TBES: 32
// Number of Monitor Activity Entries
NUMBER_OF_MATES: 4
// NOTE: Finite buffering allows us to simulate a realistic virtual
// cut-through routed network with idealized flow control.
FINITE_BUFFERING: false
// All message buffers within the network (i.e. the switch's input and
// output buffers) are set to the size specified below by
// FINITE_BUFFER_SIZE
FINITE_BUFFER_SIZE: 3
// g_SEQUENCER_OUTSTANDING_REQUESTS (above) controls the number of
// demand requests issued by the sequencer. The PROCESSOR_BUFFER_SIZE
// controls the number of requests in the mandatory queue.
// Only affects the simulation when FINITE_BUFFERING is enabled
PROCESSOR_BUFFER_SIZE: 10
// The PROTOCOL_BUFFER_SIZE limits the size of all other buffers
// connecting to Controllers. Controls the number of requests issued by
// the L2 HW Prefetcher
PROTOCOL_BUFFER_SIZE: 32
TSO: false
g_MASK_PREDICTOR_CONFIG: AlwaysBroadcast
g_TOKEN_REISSUE_THRESHOLD: 2
g_PERSISTENT_PREDICTOR_CONFIG: None
g_NETWORK_TOPOLOGY: HIERARCHICAL_SWITCH
g_CACHE_DESIGN: NUCA
g_endpoint_bandwidth: 10000
g_adaptive_routing: true
NUMBER_OF_VIRTUAL_NETWORKS: 6
FAN_OUT_DEGREE: 4
g_PRINT_TOPOLOGY: false
// the following variables must be calculated
g_NUM_DNUCA_BANK_SET_BITS: 0
g_NUM_BANKS_IN_BANK_SET_BITS: 0
g_NUM_BANKS_IN_BANK_SET: 0
// NUCA variables
g_NUM_DNUCA_BANK_SETS: 32
g_NUCA_PREDICTOR_CONFIG: NULL
ENABLE_MIGRATION: false
ENABLE_REPLICATION: false
COLLECTOR_HANDLES_OFF_CHIP_REQUESTS: false
// PERFECT_DNUCA_SEARCH is only valid for DNUCA protocols;
// it assumes perfect L2 cache knowledge at the L1 controllers
PERFECT_DNUCA_SEARCH: true
|