mirror of
https://github.com/vitorpamplona/amethyst.git
synced 2025-03-17 21:31:57 +01:00
Improves the speed of the text parser.
This commit is contained in:
parent
5886c866d3
commit
5b77e39c8b
@ -0,0 +1,194 @@
|
||||
/**
|
||||
* Copyright (c) 2024 Vitor Pamplona
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
* this software and associated documentation files (the "Software"), to deal in
|
||||
* the Software without restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
|
||||
* Software, and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
|
||||
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package com.vitorpamplona.amethyst.benchmark
|
||||
|
||||
import androidx.benchmark.junit4.BenchmarkRule
|
||||
import androidx.benchmark.junit4.measureRepeated
|
||||
import androidx.test.ext.junit.runners.AndroidJUnit4
|
||||
import com.linkedin.urls.detection.UrlDetector
|
||||
import com.linkedin.urls.detection.UrlDetectorOptions
|
||||
import com.vitorpamplona.amethyst.commons.HashTagSegment
|
||||
import com.vitorpamplona.amethyst.commons.ImageSegment
|
||||
import com.vitorpamplona.amethyst.commons.LinkSegment
|
||||
import com.vitorpamplona.amethyst.commons.RichTextParser
|
||||
import com.vitorpamplona.quartz.events.EmptyTagList
|
||||
import junit.framework.TestCase.assertNull
|
||||
import junit.framework.TestCase.assertTrue
|
||||
import org.junit.Rule
|
||||
import org.junit.Test
|
||||
import org.junit.runner.RunWith
|
||||
|
||||
@RunWith(AndroidJUnit4::class)
|
||||
class RichTextParserBenchmark {
|
||||
@get:Rule
|
||||
val benchmarkRule = BenchmarkRule()
|
||||
|
||||
/**
 * Benchmarks [RichTextParser.parseMediaUrl] on an `.apk` download link and checks,
 * on every iteration, that the parser returns null for it.
 */
@Test
fun parseApkUrl() {
    benchmarkRule.measureRepeated {
        val media =
            RichTextParser().parseMediaUrl(
                "https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk",
                EmptyTagList,
            )
        assertNull(media)
    }
}
|
||||
|
||||
/**
 * Benchmarks a full [RichTextParser.parseText] pass over a three-word sentence whose
 * middle word is a `.jpg` link, asserting that the word at index 1 is an [ImageSegment].
 */
@Test
fun parseImageUrl() {
    benchmarkRule.measureRepeated {
        val state = RichTextParser().parseText("first https://m.primal.net/HeKw.jpg second", EmptyTagList)
        val secondWord = state.paragraphs[0].words[1]
        assertTrue(secondWord is ImageSegment)
    }
}
|
||||
|
||||
/**
 * Benchmarks [RichTextParser.parseText] on a sentence containing a domain written
 * without a scheme, asserting that the word at index 1 is a [LinkSegment].
 */
@Test
fun parseNoSchemeUrl() {
    benchmarkRule.measureRepeated {
        val state = RichTextParser().parseText("first amethyst.social second", EmptyTagList)
        val secondWord = state.paragraphs[0].words[1]
        assertTrue(secondWord is LinkSegment)
    }
}
|
||||
|
||||
/**
 * Benchmarks [RichTextParser.parseText] on a sentence containing a hashtag,
 * asserting that the word at index 1 is a [HashTagSegment].
 */
@Test
fun parseHashtag() {
    benchmarkRule.measureRepeated {
        val state = RichTextParser().parseText("first #amethyst second", EmptyTagList)
        val secondWord = state.paragraphs[0].words[1]
        assertTrue(secondWord is HashTagSegment)
    }
}
|
||||
|
||||
/** Benchmarks a complete [RichTextParser.parseText] run over [testCase1]; the result is discarded. */
@Test
fun computeTestCase1All() {
    benchmarkRule.measureRepeated {
        val parser = RichTextParser()
        parser.parseText(testCase1, EmptyTagList)
    }
}
|
||||
|
||||
/** Benchmarks a complete [RichTextParser.parseText] run over [testCase2]; the result is discarded. */
@Test
fun computeTestCase2All() {
    benchmarkRule.measureRepeated {
        val parser = RichTextParser()
        parser.parseText(testCase2, EmptyTagList)
    }
}
|
||||
|
||||
/**
 * Benchmarks only the [UrlDetector] step (default options) over [testCase2],
 * without any of the parser's own filtering.
 */
@Test
fun computeTestCase2UrlDetector() {
    benchmarkRule.measureRepeated {
        val detector = UrlDetector(testCase2, UrlDetectorOptions.Default)
        detector.detect()
    }
}
|
||||
|
||||
/** Benchmarks [RichTextParser.parseValidUrls] over [testCase2]; the result is discarded. */
@Test
fun computeTestCase2ParseUrls() {
    benchmarkRule.measureRepeated {
        val parser = RichTextParser()
        parser.parseValidUrls(testCase2)
    }
}
|
||||
|
||||
/** Benchmarks a complete [RichTextParser.parseText] run over [testCase3]; the result is discarded. */
@Test
fun computeTestCase3All() {
    benchmarkRule.measureRepeated {
        val parser = RichTextParser()
        parser.parseText(testCase3, EmptyTagList)
    }
}
|
||||
|
||||
val testCase1 = """
|
||||
#Amethyst v0.83.10
|
||||
|
||||
تحديث جديد لـ Amethyst بإصدار 0.83.10 مع تعديلات وإضافات جديدة
|
||||
|
||||
: NIP-92 إصلاحات الأخطاء
|
||||
|
||||
الإضافات الجديدة:
|
||||
- يتضمن رابط المنتج في الرسالة الأولى من المشتري في السوق
|
||||
- يضيف دعمًا لـ NIP-92 في الرسائل العامة والرسائل المباشرة الجديدة (NIP-17). يبقى NIP-54 في NIP-04 DMs
|
||||
- إضافة التمرير الأفقي إلى أزرار الإجراءات في شاشة النشر الجديد لإصلاح الأزرار المخفية جزئيًا في الشاشات الصغيرة/الرفيعة.
|
||||
|
||||
اصلاحات الشوائب:
|
||||
- إصلاحات التعطل مع مبلغ Zap مخصص غير صالح
|
||||
- يعمل على إصلاح مشكلات إعادة اتصال التتابع عندما يقوم المرحل بإغلاق الاتصال
|
||||
- إصلاح الحشو العلوي للملاحظة المقتبسة في المنشور
|
||||
- تحسين استخدام الذاكرة للمستخدم المرئي وعلامة URL في المشاركات الجديدة
|
||||
|
||||
الترجمات المحدثة:
|
||||
- الفارسية بواسطة
|
||||
- الفرنسية والإنجليزية، المملكة المتحدة بواسطة
|
||||
- الأوكرانية
|
||||
- الإسبانية والإسبانية والمكسيك والإسبانية والولايات المتحدة بواسطة
|
||||
- العربية
|
||||
|
||||
تحسينات جودة الكود:
|
||||
- تحديثات لنظام Android Studio 2023.1.1 Patch 2
|
||||
|
||||
|
||||
|
||||
|
||||
nostr:nevent1qqszq7kl888sw0c5rpvepn8w373zt0jrw8864x8lkauxxw335s66rzgppemhxue69uhkummn9ekx7mp0qgsyvrp9u6p0mfur9dfdru3d853tx9mdjuhkphxuxgfwmryja7zsvhqrqsqqqqqpaax7m2
|
||||
"""
|
||||
|
||||
val testCase2 = """
|
||||
#Amethyst v0.83.10: NIP-92 and Bug Fixes
|
||||
|
||||
New Additions:
|
||||
- Includes a link to the product in the first message from the buyer in the marketplace
|
||||
- Adds support for NIP-92 in public messages and new DMs (NIP-17). NIP-54 stays in NIP-04 DMs
|
||||
- Adds Horizontal Scroll to the action buttons in the New Post screen to partially fix hidden buttons in small/thin screens.
|
||||
|
||||
Bugfixes:
|
||||
- Fixes crash with an invalid custom Zap Amount
|
||||
- Fixes relay re-connection issues when the relay closes a connection
|
||||
- Fixes the top padding of the quoted note in a post
|
||||
- Optimizes memory use of the visual user and url tagger in new posts
|
||||
|
||||
Updated translations:
|
||||
- Persian by nostr:npub1cpazafytvafazxkjn43zjfwtfzatfz508r54f6z6a3rf2ws8223qc3xxpk
|
||||
- French and English, United Kingdom by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
|
||||
- Ukrainian by crowdin.com/profile/liizzzz
|
||||
- Spanish, Spanish, Mexico and Spanish, United States by nostr:npub1luhyzgce7qtcs6r6v00ryjxza8av8u4dzh3avg0zks38tjktnmxspxq903
|
||||
- Arabic by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
|
||||
|
||||
Code Quality Improvements:
|
||||
- Updates to Android Studio 2023.1.1 Patch 2
|
||||
|
||||
Download:
|
||||
- [Play Edition](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk )
|
||||
- [FOSS Edition - No translations](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-fdroid-universal-v0.83.10.apk )
|
||||
"""
|
||||
|
||||
val testCase3 = """#100aDayUntil100k
|
||||
Day 5 ✔️
|
||||
|
||||
Seems like they may be getting easier"""
|
||||
}
|
@ -74,29 +74,35 @@ class RichTextParser() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs [UrlDetector] with default options over [content] and returns, in detection
 * order and without duplicates, the original text of every candidate that survives
 * the filters below.
 *
 * @param content the text to scan for URLs.
 * @return the accepted URL strings.
 */
fun parseValidUrls(content: String): LinkedHashSet<String> {
    val detected = UrlDetector(content, UrlDetectorOptions.Default).detect()
    val accepted = LinkedHashSet<String>(detected.size)

    for (candidate in detected) {
        val text = candidate.originalUrl
        val keep =
            when {
                // Anything with an '@' is kept unless it is a plain e-mail address.
                text.contains("@") -> !Patterns.EMAIL_ADDRESS.matcher(text).matches()
                // avoids urls that look like 123.22
                isNumber(text) -> false
                // avoids Japanese characters as fake urls
                text.contains("。") -> false
                else -> HTTPRegex.matches(text)
            }
        if (keep) accepted.add(text)
    }

    return accepted
}
|
||||
|
||||
fun parseText(
|
||||
content: String,
|
||||
tags: ImmutableListOfLists<String>,
|
||||
): RichTextViewerState {
|
||||
val urls = UrlDetector(content, UrlDetectorOptions.Default).detect()
|
||||
|
||||
val urlSet =
|
||||
urls.mapNotNullTo(LinkedHashSet(urls.size)) {
|
||||
// removes e-mails
|
||||
if (Patterns.EMAIL_ADDRESS.matcher(it.originalUrl).matches()) {
|
||||
null
|
||||
} else if (isNumber(it.originalUrl)) {
|
||||
null
|
||||
} else if (it.originalUrl.contains("。")) {
|
||||
null
|
||||
} else {
|
||||
if (HTTPRegex.matches(it.originalUrl)) {
|
||||
it.originalUrl
|
||||
} else {
|
||||
null
|
||||
}
|
||||
}
|
||||
}
|
||||
val urlSet = parseValidUrls(content)
|
||||
|
||||
val imagesForPager =
|
||||
urlSet.mapNotNull { fullUrl -> parseMediaUrl(fullUrl, tags) }.associateBy { it.url }
|
||||
@ -153,8 +159,29 @@ class RichTextParser() {
|
||||
return paragraphSegments.toImmutableList()
|
||||
}
|
||||
|
||||
fun isNumber(word: String): Boolean {
|
||||
return numberPattern.matcher(word).matches()
|
||||
/** Reports whether `numberPattern` matches [word], using `Matcher.matches()`. */
private fun isNumber(word: String): Boolean {
    val matcher = numberPattern.matcher(word)
    return matcher.matches()
}
|
||||
|
||||
/**
 * Cheap per-character pre-filter for phone-number candidates.
 *
 * Accepts ASCII digits, the separators '-', ' ' and '.', and also '+', '(' and ')'.
 * The last three are needed so that international ("+15551234567") and area-code
 * ("(555)123-4567") notations, which the phone regex accepts, are not discarded
 * before the regex ever sees them.
 *
 * @param c the character to classify.
 * @return true if [c] may appear in a phone number.
 */
private fun isPhoneNumberChar(c: Char): Boolean =
    when (c) {
        in '0'..'9' -> true
        '-', ' ', '.' -> true
        '+', '(', ')' -> true
        else -> false
    }
|
||||
|
||||
/**
 * Quick screen for words that could be phone numbers: the length must be within
 * 7..14 and every character must pass [isPhoneNumberChar]. Scanning stops at the
 * first character that fails.
 *
 * @param word the word to screen.
 * @return true if [word] passes both checks.
 */
fun isPotentialPhoneNumber(word: String): Boolean = word.length in 7..14 && word.all { isPhoneNumberChar(it) }
|
||||
|
||||
fun isDate(word: String): Boolean {
|
||||
@ -172,46 +199,48 @@ class RichTextParser() {
|
||||
emojis: Map<String, String>,
|
||||
tags: ImmutableListOfLists<String>,
|
||||
): Segment {
|
||||
val emailMatcher = Patterns.EMAIL_ADDRESS.matcher(word)
|
||||
val phoneMatcher = Patterns.PHONE.matcher(word)
|
||||
val schemelessMatcher = noProtocolUrlValidator.matcher(word)
|
||||
if (word.isEmpty()) return RegularTextSegment(word)
|
||||
|
||||
return if (word.isEmpty()) {
|
||||
RegularTextSegment(word)
|
||||
} else if (images.contains(word)) {
|
||||
ImageSegment(word)
|
||||
} else if (urls.contains(word)) {
|
||||
LinkSegment(word)
|
||||
} else if (emojis.any { word.contains(it.key) }) {
|
||||
EmojiSegment(word)
|
||||
} else if (word.startsWith("lnbc", true)) {
|
||||
InvoiceSegment(word)
|
||||
} else if (word.startsWith("lnurl", true)) {
|
||||
WithdrawSegment(word)
|
||||
} else if (word.startsWith("cashuA", true)) {
|
||||
CashuSegment(word)
|
||||
} else if (emailMatcher.matches()) {
|
||||
EmailSegment(word)
|
||||
} else if (word.length in 7..14 && !isDate(word) && phoneMatcher.matches()) {
|
||||
PhoneSegment(word)
|
||||
} else if (startsWithNIP19Scheme(word)) {
|
||||
BechSegment(word)
|
||||
} else if (word.startsWith("#")) {
|
||||
parseHash(word, tags)
|
||||
} else if (word.contains(".") && schemelessMatcher.find()) {
|
||||
val url = schemelessMatcher.group(1) // url
|
||||
val additionalChars = schemelessMatcher.group(4).ifEmpty { null } // additional chars
|
||||
val pattern =
|
||||
"""^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?"""
|
||||
.toRegex(RegexOption.IGNORE_CASE)
|
||||
if (pattern.find(word) != null) {
|
||||
SchemelessUrlSegment(word, url, additionalChars)
|
||||
} else {
|
||||
RegularTextSegment(word)
|
||||
}
|
||||
} else {
|
||||
RegularTextSegment(word)
|
||||
if (images.contains(word)) return ImageSegment(word)
|
||||
|
||||
if (urls.contains(word)) return LinkSegment(word)
|
||||
|
||||
if (word.startsWith(":") && emojis.any { word.contains(it.key) }) return EmojiSegment(word)
|
||||
|
||||
if (word.startsWith("lnbc", true)) return InvoiceSegment(word)
|
||||
|
||||
if (word.startsWith("lnurl", true)) return WithdrawSegment(word)
|
||||
|
||||
if (word.startsWith("cashuA", true)) return CashuSegment(word)
|
||||
|
||||
if (startsWithNIP19Scheme(word)) return BechSegment(word)
|
||||
|
||||
if (word.startsWith("#")) return parseHash(word, tags)
|
||||
|
||||
if (word.contains("@")) {
|
||||
if (Patterns.EMAIL_ADDRESS.matcher(word).matches()) return EmailSegment(word)
|
||||
}
|
||||
|
||||
if (isPotentialPhoneNumber(word) && !isDate(word)) {
|
||||
if (Patterns.PHONE.matcher(word).matches()) return PhoneSegment(word)
|
||||
}
|
||||
|
||||
val indexOfPeriod = word.indexOf(".")
|
||||
if (indexOfPeriod > 0 && indexOfPeriod < word.length - 1) { // periods cannot be the last one
|
||||
val schemelessMatcher = noProtocolUrlValidator.matcher(word)
|
||||
if (schemelessMatcher.find()) {
|
||||
val url = schemelessMatcher.group(1) // url
|
||||
val additionalChars = schemelessMatcher.group(4).ifEmpty { null } // additional chars
|
||||
val pattern =
|
||||
"""^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?"""
|
||||
.toRegex(RegexOption.IGNORE_CASE)
|
||||
if (pattern.find(word) != null && url != null) {
|
||||
return SchemelessUrlSegment(word, url, additionalChars)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return RegularTextSegment(word)
|
||||
}
|
||||
|
||||
private fun parseHash(
|
||||
@ -289,7 +318,11 @@ class RichTextParser() {
|
||||
/**
 * Case-insensitive pattern for a word starting with '#': group 1 captures the run of
 * characters after '#' up to the first whitespace or listed punctuation character,
 * and group 2 captures whatever follows.
 */
val hashTagsPattern: Pattern =
    "#([^\\s!@#\$%^&*()=+./,\\[{\\]};:'\"?><]+)(.*)".toPattern(Pattern.CASE_INSENSITIVE)
|
||||
|
||||
val acceptedNIP19schemes = listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1")
|
||||
/** Accepted NIP-19 prefixes: the lowercase forms followed by their uppercase counterparts. */
val acceptedNIP19schemes =
    listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1").let { lowercase ->
        lowercase + lowercase.map { it.uppercase() }
    }
|
||||
|
||||
private fun removeQueryParamsForExtensionComparison(fullUrl: String): String {
|
||||
return if (fullUrl.contains("?")) {
|
||||
@ -344,9 +377,18 @@ class RichTextParser() {
|
||||
}
|
||||
|
||||
/**
 * Reports whether [word] begins with one of [acceptedNIP19schemes], optionally
 * preceded by "@", "nostr:" and "@" (each at most once, in that order).
 *
 * The comparison ignores case and walks an offset into [word] instead of building
 * lowercased or prefix-stripped copies, so no intermediate strings are allocated.
 * This accepts the same inputs as
 * `word.lowercase().removePrefix("@").removePrefix("nostr:").removePrefix("@")`
 * followed by a prefix check, including mixed-case forms such as "Nostr:npub1...".
 *
 * @param word the word to inspect; an empty word yields false.
 * @return true if a NIP-19 prefix is found at the computed offset.
 */
fun startsWithNIP19Scheme(word: String): Boolean {
    if (word.isEmpty()) return false

    var offset = 0
    // Optional leading mention marker.
    if (word[offset] == '@') offset++
    // Optional "nostr:" URI scheme, in any letter case.
    if (word.startsWith("nostr:", offset, ignoreCase = true)) offset += 6
    // Optional mention marker after the scheme.
    if (offset < word.length && word[offset] == '@') offset++

    return acceptedNIP19schemes.any { word.startsWith(it, offset, ignoreCase = true) }
}
|
||||
|
||||
/** Reports whether `noProtocolUrlValidator` matches [url], using `Matcher.matches()`. */
fun isUrlWithoutScheme(url: String): Boolean {
    val matcher = noProtocolUrlValidator.matcher(url)
    return matcher.matches()
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user