LLM-Psychometrics.github.io/index.html at master · ValueByte-AI/LLM-Psychometrics.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
<!DOCTYPE html>
<html lang="en">
<!-- lang="en" declares the document language for screen readers, translation tools, and search engines. -->
<head>
  <meta charset="utf-8">
  <!-- Meta tags for social media banners.
       The Open Graph protocol expects og:url and og:image to be full URLs, so
       the site address (the project website named in the citation note further
       down this page) is spelled out instead of the template placeholder.
       NOTE(review): confirm the image paths resolve on the production domain and
       that the declared 1200x630 size matches overview_000.png. -->
  <meta name="description" content="Large Language Model Psychometrics: A Systematic Review of Evaluation, Validation, and Enhancement">
  <meta property="og:title" content="Large Language Model Psychometrics: A Systematic Review of Evaluation, Validation, and Enhancement">
  <meta property="og:description" content="A systematic review on large language model evaluation methods">
  <meta property="og:url" content="https://llm-psychometrics.com/">
  <meta property="og:image" content="https://llm-psychometrics.com/static/images/overview_000.png">
  <meta property="og:image:width" content="1200">
  <meta property="og:image:height" content="630">

  <meta name="twitter:title" content="Large Language Model Psychometrics: A Systematic Review of Evaluation, Validation, and Enhancement">
  <meta name="twitter:description" content="A systematic review on large language model evaluation methods">
  <meta name="twitter:image" content="https://llm-psychometrics.com/static/images/fig.png">
  <meta name="twitter:card" content="summary_large_image">
  <meta name="keywords" content="large language models, LLM, psychometrics, evaluation, validation, enhancement, survey">
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <title>Large Language Model Psychometrics:
    A Systematic Review of
    Evaluation, Validation, and Enhancement
  </title>
  <link rel="icon" type="image/x-icon" href="static/images/llm_psychometrics.ico">
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">

  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>

  <!-- Google tag (gtag.js) -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-HWFCQDEG04"></script>
  <script>
    // Google Analytics (gtag.js) bootstrap for measurement ID G-HWFCQDEG04.
    // Reuse the command queue if something already created it; otherwise start an empty one.
    if (!window.dataLayer) {
      window.dataLayer = [];
    }

    // Each gtag(...) call is queued by pushing its raw `arguments` object.
    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());
    gtag('config', 'G-HWFCQDEG04');
  </script>

<style>
    /* Paper title in the hero header: capped at 1500px wide and centered via auto side margins. */
    .publication-title {
      font-size: 1.4rem;
      line-height: 1.2;
      font-weight: 600;
      margin-bottom: 1rem;
      max-width: 1500px;
      margin-left: auto;
      margin-right: auto;
    }

    /* Title size for viewports up to 768px wide. */
    @media screen and (max-width: 768px) {
      .publication-title {
        font-size: 1.6rem; /* Previously 1.8rem. NOTE(review): still larger than the 1.4rem base size above; confirm this is intended. */
      }
    }

    /* Caption text shown under each figure: centered, grey, bold italic. */
    .image-caption {
      text-align: center;
      font-size: 0.95rem;
      color: #555;
      margin-top: 0.7rem;
      margin-bottom: 2rem;
      font-style: italic;
      font-weight: bold; /* Added to make captions bold */
    }

    /* Section heading with room for the accent bar drawn by :after below.
       position: relative makes the heading the positioning reference for that bar.
       No element in this file's markup currently uses this class. */
    .section-title {
      margin-top: 2.5rem;
      margin-bottom: 1.5rem;
      font-weight: 700;
      color: #363636;
      position: relative;
      padding-bottom: 0.5rem;
    }

    /* 50px x 3px accent bar pinned to the bottom-left corner of the heading. */
    .section-title:after {
      content: '';
      position: absolute;
      bottom: 0;
      left: 0;
      width: 50px;
      height: 3px;
      background-color: #485fc7;
    }

    /* Card that wraps the abstract paragraph. */
    .abstract {
      background-color: #f9f9f9;
      padding: 2rem;
      border-radius: 8px;
      margin-bottom: 2.5rem;
      box-shadow: 0 2px 5px rgba(0,0,0,0.05);
      font-size: 1.05rem;
      line-height: 1.6;
      width: 100%; /* Made abstract wider */
    }

    /* Accent-colored lead-in term at the start of each bullet in the list sections. */
    .highlight {
      color: #485fc7;
      font-weight: 600;
    }

    /* Rounded, shadowed frame around a figure and its caption.
       overflow: hidden clips the image to the rounded corners. */
    .image-container {
      margin-bottom: 3rem;
      transition: transform 0.3s ease;
      border-radius: 8px;
      overflow: hidden;
      box-shadow: 0 4px 12px rgba(0,0,0,0.1);
    }

    /* Images fill the full width of their frame. */
    .image-container img {
      display: block;
      width: 100%;
      transition: transform 0.3s ease;
    }

    /* Lift the frame 5px on hover (animated by the transition declared above). */
    .image-container:hover {
      transform: translateY(-5px);
    }

    /* Feature card with a left accent border.
       No element in this file's markup currently uses .feature-box or .feature-icon. */
    .feature-box {
      background-color: #f9f9f9;
      padding: 1.8rem;
      border-radius: 8px;
      height: 100%;
      transition: all 0.3s ease;
      box-shadow: 0 2px 5px rgba(0,0,0,0.05);
      border-left: 4px solid #485fc7;
    }

    /* Hover state: lift the card and deepen its shadow. */
    .feature-box:hover {
      transform: translateY(-5px);
      box-shadow: 0 8px 15px rgba(0,0,0,0.1);
    }

    /* Large accent-colored icon for a feature card. */
    .feature-icon {
      font-size: 2.2rem;
      color: #485fc7;
      margin-bottom: 1.2rem;
    }

    /* Page-wide typography defaults. */
    body {
      font-family: 'Noto Sans', sans-serif;
      line-height: 1.6;
      color: #333;
    }

    /* Headings prefer Google Sans and fall back to the body font. */
    .title, .subtitle {
      font-family: 'Google Sans', 'Noto Sans', sans-serif;
    }

    /* Paragraph spacing and size inside .content blocks. */
    .content p {
      margin-bottom: 1.5rem;
      font-size: 1.05rem;
    }

    /* Padding applied to every .section. */
    .section {
      padding: 3rem 1.5rem;
    }

    /* Hero padding: 3rem top, 1.5rem sides, 2rem bottom. */
    .hero-body {
      padding: 3rem 1.5rem 2rem;
    }

    /* Monospace box holding the BibTeX entry; scrolls horizontally when lines overflow. */
    .citation-box {
      background-color: #f5f5f5;
      padding: 1.5rem;
      border-radius: 8px;
      font-family: monospace;
      overflow-x: auto;
      font-size: 0.9rem;
      line-height: 1.5;
      border: 1px solid #e0e0e0;
    }

    /* Page footer background and padding. */
    .footer {
      background-color: #fafafa;
      padding: 3rem 1.5rem;
    }

    /* Dark link buttons (Paper / Awesome) with an animated color change. */
    .button.is-dark {
      background-color: #363636;
      transition: all 0.3s ease;
    }

    /* Hover state switches the button to the accent color. */
    .button.is-dark:hover {
      background-color: #485fc7;
    }

    /* Space below the author name row. */
    .authors-section {
      margin-bottom: 1.5rem;
    }

    /* Text size for author names and affiliation lines. */
    .author-block {
      font-size: 1.1rem;
    }

    /* Height cap for a publication logo image.
       No element in this file's markup currently uses .publication-logo. */
    .publication-logo img {
      max-height: 100px;
      margin-bottom: 1rem;
    }

    /* Removes the top padding from the section that holds the overview figure. */
    .main-image-section {
      padding-top: 0;
    }

    /* Top spacing for a key-features block.
       No element in this file's markup currently uses .key-features. */
    .key-features {
      margin-top: 2.5rem;
    }

    /* Vertical spacing between bullets inside .content lists. */
    .content ul li {
      margin-bottom: 0.7rem;
    }

    /* Extra vertical margins for the overview figure's frame. */
    .top-overview-image {
      margin-top: 2rem;
      margin-bottom: 1rem;
    }

    /* Grey rounded panel used by the "LLM Enhancement Techniques" and "Future Directions" sections. */
    .highlighted-section {
      background-color: #f5f5f5;
      padding: 2.5rem;
      border-radius: 10px;
      margin-bottom: 2rem;
      box-shadow: 0 2px 8px rgba(0,0,0,0.05);
    }

    /* Top margin for a .content block inside a .reversed-layout column (image above text). */
    .reversed-layout .content {
      margin-top: 2rem;
    }

    /* Comparison Table Styles */
    /* border-collapse: separate together with overflow: hidden lets border-radius
       round the table's outer corners. */
    .comparison-table {
      width: 100%;
      border-collapse: separate;
      border-spacing: 0;
      border-radius: 8px;
      overflow: hidden;
      box-shadow: 0 4px 12px rgba(0,0,0,0.1);
      margin-bottom: 2rem;
    }

    /* Header cells: white text on the accent color. */
    .comparison-table th {
      background-color: #485fc7;
      color: white;
      padding: 1rem;
      text-align: left;
      font-weight: 600;
      font-size: 1.05rem;
    }

    /* Zebra striping: even rows get a light tint... */
    .comparison-table tr:nth-child(even) {
      background-color: #f5f7ff;
    }

    /* ...and odd rows stay white. */
    .comparison-table tr:nth-child(odd) {
      background-color: white;
    }

    /* Body cells: top-aligned with a thin divider below each row. */
    .comparison-table td {
      padding: 1rem;
      border-bottom: 1px solid #eaeaea;
      font-size: 0.95rem;
      vertical-align: top;
    }

    /* The last row has no divider. */
    .comparison-table tr:last-child td {
      border-bottom: none;
    }

    /* First column (the feature name) is emphasized and takes 18% of the width. */
    .comparison-table td:first-child {
      font-weight: 600;
      color: #485fc7;
      width: 18%;
    }

    /* The two comparison columns take 41% each (18% + 41% + 41% = 100%). */
    .comparison-table td:nth-child(2),
    .comparison-table td:nth-child(3) {
      width: 41%;
    }

    /* Wrapper that scrolls horizontally when the table is wider than the viewport. */
    .table-container {
      overflow-x: auto;
      margin-top: 2rem;
    }

    /* Caption line displayed above the table. */
    .table-caption {
      text-align: center;
      font-weight: bold;
      margin-bottom: 1rem;
      font-size: 1.1rem;
      color: #485fc7;
    }

    /* Tighter cell padding and smaller text for viewports up to 768px wide. */
    @media screen and (max-width: 768px) {
      .comparison-table td, .comparison-table th {
        padding: 0.75rem;
        font-size: 0.9rem;
      }
    }

    /* New enhanced styles for text content sections */
    /* Gradient card with a left accent border. position: relative makes the card
       the positioning reference for the ::before overlay below. */
    .text-content-box {
      background: linear-gradient(to right, #f8f9ff, #eef1ff);
      padding: 2rem;
      border-radius: 12px;
      margin-top: 1.5rem;
      margin-bottom: 2rem;
      box-shadow: 0 3px 10px rgba(72, 95, 199, 0.1);
      border-left: 5px solid #485fc7;
      position: relative;
    }

    /* Faint decorative tint stretched over the whole card. */
    .text-content-box::before {
      content: "";
      position: absolute;
      top: 0;
      left: 0;
      width: 100%;
      height: 100%;
      background: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="100" height="100" viewBox="0 0 100 100"><path fill="%23485fc7" fill-opacity="0.03" d="M0 0h100v100H0z"/></svg>');
      opacity: 0.5;
      border-radius: 12px;
      z-index: 0;
      /* The overlay is painted above any non-positioned content in the card
         (such as the box heading). Without this it would swallow clicks and
         text selection aimed at that content. */
      pointer-events: none;
    }

    /* Paragraphs are lifted above the overlay. */
    .text-content-box p {
      position: relative;
      z-index: 1;
      line-height: 1.7;
    }

    /* Avatar replacement */
    /* 40px circular badge with centered white text. The script at the end of
       this page assigns this class to the element it swaps in for avatar images. */
    .author-avatar {
      width: 40px;
      height: 40px;
      border-radius: 50%;
      background-color: #485fc7;
      display: flex;
      align-items: center;
      justify-content: center;
      color: white;
      font-weight: bold;
      font-size: 18px;
      margin-right: 10px;
    }

    /* Gradient text for section titles */
    /* The gradient is painted as the background and clipped to the glyph shapes;
       the transparent text color lets it show through. */
    .gradient-title {
      background: linear-gradient(90deg, #485fc7, #8c9eff);
      -webkit-background-clip: text;
      background-clip: text;
      color: transparent;
      display: inline-block;
      font-weight: 700;
      margin-bottom: 1rem;
    }
  </style>
</head>
<body>

  <!-- Hero header: paper title, authors, affiliations, and the Paper / GitHub link buttons. -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">Large Language Model Psychometrics: A Systematic Review of Evaluation, Validation, and Enhancement</h1>


            <div class="is-size-5 publication-authors authors-section">
              <!-- Paper authors. Authors without a homepage are plain text: a
                   placeholder href="#" with target="_blank" would only open a
                   duplicate tab of this page. Superscripts map to the affiliations below. -->
              <span class="author-block">
                <a href="https://yehaoran.info" target="_blank" rel="noopener">Haoran&nbsp;Ye<sup>1</sup></a>,
              </span>
              <span class="author-block">
                Jing&nbsp;Jin<sup>1</sup>,
              </span>
              <span class="author-block">
                Yuhang&nbsp;Xie<sup>1</sup>,
              </span>
              <span class="author-block">
                <a href="https://www.psy.pku.edu.cn/szdw/qzjy/fjs/zx/index.htm" target="_blank" rel="noopener">Xin&nbsp;Zhang<sup>2,3</sup></a>,
              </span>
              <span class="author-block">
                <a href="https://www.cis.pku.edu.cn/info/1362/2256.htm" target="_blank" rel="noopener">Guojie&nbsp;Song<sup>1,4</sup></a>
              </span>
            </div>

            <!-- Affiliations, keyed by the superscript numbers used above. -->
            <div class="is-size-5 publication-authors">
              <span class="author-block"><sup>1</sup>State Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University</span><br>
              <span class="author-block"><sup>2</sup>School of Psychological and Cognitive Sciences, Peking University</span><br>
              <span class="author-block"><sup>3</sup>Key Laboratory of Machine Perception (Ministry of Education), Peking University</span><br>
              <span class="author-block"><sup>4</sup>PKU‑Wuhan Institute for Artificial Intelligence</span>
            </div>

            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- Paper link (arXiv abstract page) -->
                <span class="link-block">
                  <a href="https://arxiv.org/abs/2505.08245" target="_blank" rel="noopener" class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                    </span>
                    <span>Paper</span>
                  </a>
                </span>

                <!-- Github link -->
                <span class="link-block">
                  <a href="https://github.com/ValueByte-AI/Awesome-LLM-Psychometrics" target="_blank" rel="noopener" class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Awesome</span>
                  </a>
                </span>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

  <!-- Project logo, centered between the hero and the overview figure.
       The inline max-width caps the rendered width at 265px; the negative top
       margin pulls the logo 1rem up toward the hero. -->
  <div class="has-text-centered" style="margin-top: -1.0rem; margin-bottom: 1.5rem;">
    <img src="static/images/fig.png" alt="LLM Psychometrics Logo" style="max-width: 265px; width: 120%; height: auto;">
  </div>

  <!-- Main overview image at the top -->
  <section class="section main-image-section">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column">
          <div class="image-container top-overview-image">
            <!-- The caption is a <figcaption> inside the <figure> so it is
                 programmatically tied to the image; it keeps the .image-caption
                 class, so its styling is unchanged. -->
            <figure class="image">
              <img src="static/images/overview_000.png" alt="Overview of LLM Psychometrics Framework">
              <figcaption class="image-caption">Figure 1: Overview of our survey on LLM Psychometrics.</figcaption>
            </figure>
          </div>
        </div>
      </div>
    </div>
  </section>

  <!-- Abstract Section without Background Color -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-12">
        <h2 class="title is-3 has-text-centered" style="color: #333;">Abstract</h2>
        <div class="abstract content" style="color: #555;">
          <p>
            The rapid advancement of large language models (LLMs) has outpaced traditional evaluation methodologies. It presents novel challenges, such as measuring human-like psychological constructs, navigating beyond static and task-specific benchmarks, and establishing human-centered evaluation. These challenges intersect with Psychometrics, the science of quantifying the intangible aspects of human psychology, such as personality, values, and intelligence. This survey introduces and synthesizes an emerging interdisciplinary field of LLM Psychometrics, which leverages psychometric instruments, theories, and principles to evaluate, understand, and enhance LLMs. We systematically explore the role of Psychometrics in shaping benchmarking principles, broadening evaluation scopes, refining methodologies, validating results, and advancing LLM capabilities. This paper integrates diverse perspectives to provide a structured framework for researchers across disciplines, enabling a more comprehensive understanding of this nascent field. Ultimately, we aim to provide actionable insights for developing future evaluation paradigms that align with human-level AI and promote the advancement of human-centered AI systems for societal benefit. A curated repository of LLM psychometric resources is available at
            <!-- Link kept on one line: whitespace inside the anchor would render as a linked space and a gap before the period. -->
            <a href="https://github.com/valuebyte-ai/Awesome-LLM-Psychometrics" target="_blank" rel="noopener" class="has-text-link">https://github.com/valuebyte-ai/Awesome-LLM-Psychometrics</a>.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>


  <!-- Comparison Table Section (Replacing Key Features) -->
  <section class="section">
    <div class="container is-max-desktop">
      <h2 class="title is-3 has-text-centered">Comparison: Psychometrics vs AI Benchmarks</h2>

      <div class="table-caption">Table 1: Systematic comparison between psychometric evaluation and conventional AI benchmarking approaches.</div>

      <div class="table-container">
        <table class="comparison-table">
          <!-- scope="col" tells assistive technology that each header labels the column beneath it. -->
          <thead>
            <tr>
              <th scope="col">Feature</th>
              <th scope="col">Psychometrics</th>
              <th scope="col">AI Benchmark</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td>Core goal</td>
              <td>To measure psychological constructs, to prove that a test measures as intended (validity evidence), and to understand the construct being measured.</td>
              <td>To test and compare the task performance of different LLMs. Focuses on ranking models and selecting the best one suited for a specific task.</td>
            </tr>
            <tr>
              <td>Philosophy of measurement</td>
              <td>Construct-oriented. Tends towards a causal approach to measurement, where the measured trait is believed to cause the measurement outcomes.</td>
              <td>Task-oriented. Leans towards representativism, assuming items exhaust or represent all aspects of the underlying ability.</td>
            </tr>
            <tr>
              <td>Target construct</td>
              <td>Personality and ability.</td>
              <td>Mostly task-specific abilities.</td>
            </tr>
            <tr>
              <td>Construct definition</td>
              <td>Emphasizes clear and detailed definitions of the construct being measured. Agreement on the construct definition is a byproduct of test development.</td>
              <td>Often defines constructs implicitly through ad hoc task selection. Construct definitions can be vague.</td>
            </tr>
            <tr>
              <td>Development process</td>
              <td>Systematic and rigorous, often following methods like Evidence-Centered Design (ECD). Can be labor-intensive.</td>
              <td>Compiles a set of relevant questions or tasks, then performs expert annotation or crowdsourcing to label ground truth answers. Less labor-intensive per item.</td>
            </tr>
            <tr>
              <td>Number of items</td>
              <td>Can vary, but not necessarily large. Focus is on item quality and relevance to the construct.</td>
              <td>Typically consists of an extensive number of questions to cover various aspects of abilities. Reliability increases with test length.</td>
            </tr>
            <tr>
              <td>Sample size</td>
              <td>Typically requires a larger sample size of test takers for robust statistical modeling.</td>
              <td>Can be applied to evaluate the performance of a single LLM on the benchmark.</td>
            </tr>
            <tr>
              <td>Statistical modeling</td>
              <td>Employs advanced and various statistical models like Item Response Theory and Factor Analysis to analyze data, estimate latent abilities, and assess model fit.</td>
              <td>Often relies on simple aggregation methods, such as calculating average accuracy across benchmark tasks.</td>
            </tr>
            <tr>
              <td>Result analysis</td>
              <td>Ensures the reliability, validity, predictive power, and explanatory power of the test through result analysis and statistical modeling.</td>
              <td>Reliability is likely to be high due to the large number of items. However, validity, predictive power, or explanatory power beyond the target task is not a primary concern.</td>
            </tr>
          </tbody>
        </table>
      </div>
    </div>
  </section>

  <!-- Psychological Constructs Section with Image - Moved text below image with enhanced styling -->
  <section class="section">
    <div class="container is-max-desktop">
      <h2 class="title is-3 has-text-centered">Measuring Psychological Constructs</h2>

      <div class="columns is-centered">
        <div class="column reversed-layout">
          <div class="image-container">
            <!-- Caption lives in a <figcaption> inside the <figure> so it is tied to the image; the .image-caption class keeps its styling. -->
            <figure class="image">
              <img src="static/images/test.png" alt="Examples of psychometric tests for LLMs">
              <figcaption class="image-caption">Figure 2: Examples of psychometric tests for LLMs, showing both personality (left) and cognitive (right) evaluations.</figcaption>
            </figure>
          </div>

          <div class="text-content-box">
            <!-- h3, not h4: this heading sits directly under the section's h2, so h3 keeps the outline from skipping a level. -->
            <h3 class="gradient-title">Psychological Constructs in LLM Research</h3>
            <p>
              LLM psychometrics evaluates LLMs in their personality and cognitive constructs. Personality constructs include (1) personality traits based on theories such as Big Five, HEXACO, MBTI, or Dark Triad; (2) values based on theories such as Schwartz, WVS, VSM, and GLOBE; (3) morality based on MFT, DIT, and ETHICS; and (4) attitudes and opinions from political panels like ANES, ATP, GLES, and PCT. In contrast, cognitive constructs include (1) heuristics and biases measured by tasks such as the Cognitive Reflection Test; (2) social interaction abilities—Theory of Mind, Emotional and Social Intelligence; (3) psychology of language covering comprehension, generation, and acquisition; and (4) learning and cognitive capabilities.
            </p>
          </div>
        </div>
      </div>
    </div>
  </section>

  <!-- Methodology Section with Image and enhanced text styling -->
  <section class="section">
    <div class="container is-max-desktop">
      <h2 class="title is-3 has-text-centered">Evaluation Methodology</h2>

      <div class="columns is-centered">
        <div class="column reversed-layout">
          <div class="image-container">
            <!-- Caption lives in a <figcaption> inside the <figure> so it is tied to the image; the .image-caption class keeps its styling. -->
            <figure class="image">
              <img src="static/images/method_00.png" alt="Psychometric Evaluation Methodology">
              <figcaption class="image-caption">Figure 3: Overview of LLM psychometric evaluation methodology, including test formats, data sources, prompting strategies, model outputs, and scoring mechanisms.</figcaption>
            </figure>
          </div>

          <div class="text-content-box">
            <!-- h3, not h4: this heading sits directly under the section's h2, so h3 keeps the outline from skipping a level. -->
            <h3 class="gradient-title">Evaluation Methodologies for LLM Psychometrics</h3>
            <p>
              LLM psychometrics mirrors a classic testing pipeline in some aspects but is more tailored to LLMs. Test formats can range from tightly controlled structured items (forced-choice or Likert) to open-ended conversations and full agentic simulations. Data sources may come from established inventories, custom-curated adaptations, or synthetic prompts automatically generated to extend test coverage. Prompting strategies include perturbing the original question and injecting performance-enhancing or role-playing instructions. Finally, output and scoring modules translate the model's raw text into numerical metrics: logit-based analysis and direct scoring for closed-ended outputs, or rule-based, model-based, or human evaluation for open-ended LLM outputs.
            </p>
          </div>
        </div>
      </div>
    </div>
  </section>

  <!-- Validation Section with Image and enhanced text styling -->
  <section class="section">
    <div class="container is-max-desktop">
      <h2 class="title is-3 has-text-centered">Psychometric Validation</h2>

      <div class="columns is-centered">
        <div class="column reversed-layout">
          <div class="image-container">
            <!-- Caption lives in a <figcaption> inside the <figure> so it is tied to the image; the .image-caption class keeps its styling. -->
            <figure class="image">
              <img src="static/images/validation_00.png" alt="Overview of psychometric validation approaches">
              <figcaption class="image-caption">Figure 4: Overview of psychometric validation: reliability and consistency, validity, and standards and recommendations.</figcaption>
            </figure>
          </div>

          <div class="text-content-box">
            <!-- h3, not h4: this heading sits directly under the section's h2, so h3 keeps the outline from skipping a level. -->
            <h3 class="gradient-title">Validation Framework for LLM Psychometrics</h3>
            <p>
              Applying psychometrics to LLMs requires validation. Reliability is assessed through test-retest reliability, parallel forms reliability, and inter-rater reliability when subjective coding is involved. Validity evidence is gathered on multiple fronts: content (guarding against training data contamination or item under-representativeness), construct (ensuring responses reflect the intended latent trait rather than confounding factors such as response sets or social desirability bias), and criterion or ecological correspondence with external benchmarks. We also gather emerging standards, such as non-disclosure of test materials, fairness across languages and cultures, and the suitability of tests for model capabilities.
            </p>
          </div>
        </div>
      </div>
    </div>
  </section>

  <!-- LLM Enhancement Section - Added highlighted background -->
  <section class="section">
    <div class="container is-max-desktop">
      <div class="highlighted-section">
        <h2 class="title is-3 has-text-centered">LLM Enhancement Techniques</h2>

        <div class="columns is-centered">
          <div class="column">
            <div class="content">
              <p>
                Psychometrics also serves as a powerful tool for model enhancement across three key domains:
              </p>

              <ul>
                <li><span class="highlight">Trait Manipulation</span>: Controlling LLM traits through prompting, inference-time interventions, and fine-tuning.</li>
                <li><span class="highlight">Safety and Alignment</span>: Leveraging psychometrics to guide LLM value alignment and improve safety.</li>
                <li><span class="highlight">Cognitive Enhancement</span>: Developing stronger or more human-like reasoning, empathy, and communication capabilities.</li>
              </ul>

            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

  <!-- Future Directions Section - Added highlighted background -->
  <section class="section">
    <div class="container is-max-desktop">
      <div class="highlighted-section">
        <h2 class="title is-3 has-text-centered">Future Directions</h2>

        <div class="columns is-centered">
          <div class="column">
            <div class="content">
              <p>
                Our survey identifies several emerging trends, challenges, and future directions for LLM psychometrics research:
              </p>

              <ul>
                <li><span class="highlight">Psychometric Validation</span>: Establish rigorous reliability and validity checks.</li>
                <li><span class="highlight">From Human Constructs to LLM Constructs</span>: Tailor psychological constructs for LLMs.</li>
                <li><span class="highlight">Perceived vs. Aligned Traits</span>: Distinguish between traits that humans perceive from LLM outputs and those aligned with human self-views.</li>
                <li><span class="highlight">Anthropomorphization Challenges</span>: Properly anthropomorphizing LLMs in psychometric tests remains a subject of academic debate.</li>
                <li><span class="highlight">Expanding Dimensions in Model Deployment</span>: Extend evaluations to multilingual, multi-turn, multimodal, agent, and multi-agent contexts where new validity issues emerge.</li>
                <li><span class="highlight">Item Response Theory</span>: Adopt IRT models to improve LLM evaluation.</li>
                <li><span class="highlight">From Evaluation to Enhancement</span>: Leverage psychometrics to enhance and align LLMs.</li>
              </ul>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

  <!-- Citation Section -->
  <section class="section">
    <div class="container is-max-desktop">
      <h2 class="title is-3 has-text-centered">Citation</h2>

      <div class="columns is-centered">
        <div class="column">
          <div class="citation-box" style="position: relative;">
            <!-- Icon-only copy control: type="button" keeps it from ever acting as a
                 submit button, and aria-label gives it an accessible name. -->
            <button type="button" onclick="copyCitation()" aria-label="Copy BibTeX citation" style="position: absolute; top: 10px; right: 10px; background: transparent; border: none; cursor: pointer;">
              <i class="fas fa-copy" aria-hidden="true"></i>
            </button>
            <!-- Single source of truth for the BibTeX entry; copyCitation() reads it from here. -->
            <pre id="citation-bibtex">
@article{ye2025large,
  title={Large Language Model Psychometrics: A Systematic Review of Evaluation, Validation, and Enhancement},
  author={Ye, Haoran and Jin, Jing and Xie, Yuhang and Zhang, Xin and Song, Guojie},
  journal={arXiv preprint arXiv:2505.08245},
  year={2025},
  note={Project website: \url{https://llm-psychometrics.com}, GitHub: \url{https://github.com/ValueByte-AI/Awesome-LLM-Psychometrics}}
}
            </pre>
            <script>
              /**
               * Copies the BibTeX entry shown in the citation box to the clipboard.
               *
               * The text is read from the <pre> element above (trimmed of the
               * surrounding layout whitespace), so the displayed and copied
               * citations cannot drift apart.
               *
               * When the async Clipboard API is unavailable (for example on a
               * non-secure origin), the citation is selected instead so the
               * visitor can copy it manually, rather than the click throwing.
               */
              function copyCitation() {
                const citationEl = document.getElementById('citation-bibtex');
                const citationText = citationEl.textContent.trim();

                if (!navigator.clipboard || !navigator.clipboard.writeText) {
                  // Fallback: highlight the entry for a manual copy.
                  const range = document.createRange();
                  range.selectNodeContents(citationEl);
                  const selection = window.getSelection();
                  selection.removeAllRanges();
                  selection.addRange(range);
                  alert('Automatic copy is not available in this browser. The citation is selected; press Ctrl+C (Cmd+C on Mac) to copy it.');
                  return;
                }

                navigator.clipboard.writeText(citationText).then(() => {
                  alert('Citation copied to clipboard!');
                }, (err) => {
                  console.error('Could not copy text: ', err);
                });
              }
            </script>
          </div>
        </div>
      </div>
    </div>
  </section>

  <!-- Footer -->
  <footer class="footer">
    <div class="container">
      <div class="content has-text-centered">
        <!-- Template attribution. rel="noopener" makes explicit that the new tab gets no window.opener handle to this page. -->
        <p>
          Website template based on <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank" rel="noopener">Academic Project Page Template</a>
        </p>
      </div>
    </div>
  </footer>

  <!-- Custom Script to replace the default avatar with our custom one -->
  <script>
    // Once the DOM is parsed, swap every <img> found under a .user-avatar
    // element for a <div class="author-avatar"> badge containing the text "LLM".
    // The markup in this file has no .user-avatar element, so the loop finds
    // nothing unless one is added elsewhere.
    document.addEventListener('DOMContentLoaded', () => {
      for (const avatarImg of document.querySelectorAll('.user-avatar img')) {
        // Build the replacement badge.
        const badge = document.createElement('div');
        badge.className = 'author-avatar';
        badge.innerHTML = 'LLM';

        // Put the badge where the image was.
        avatarImg.parentNode.replaceChild(badge, avatarImg);
      }
    });
  </script>

</body>
</html>