提问人:Sankar 提问时间:10/18/2023 更新时间:10/18/2023 访问量:26
C# 如何从Web浏览器控件的选定文本中提取html表格
C# How to extract html table from selected text of web browser control
问:
我有一个 Web 浏览器控件,用于加载网站。网页中包含许多表格数据,用户会选中其中一部分,我需要解析所选数据并将其显示在 DataGridView 中。
这是我从 Web 浏览器控件中提取选定文本的方式。
/// <summary>
/// Returns the plain text of the current selection in <c>webBrowser1</c>.
/// </summary>
/// <returns>
/// The selected text, or an empty string when no document is loaded, there is
/// no selection object, or the selection has no text.
/// </returns>
private string GetSelectedText()
{
    // Document is null until a page has been loaded into the control.
    if (webBrowser1.Document == null)
    {
        return string.Empty;
    }

    // The underlying DOM is accessed late-bound through dynamic.
    dynamic document = webBrowser1.Document.DomDocument;
    if (document == null)
    {
        return string.Empty;
    }

    dynamic selection = document.selection;
    if (selection == null)
    {
        return string.Empty;
    }

    dynamic range = selection.createRange();
    if (range == null)
    {
        return string.Empty;
    }

    string text = (string)range.text;
    return text ?? string.Empty;
}
现在,要从选定的纯文本中正确提取数据变得越来越困难。所以我的问题是:能否从所选内容中直接获取 HTML 数据(而不是纯文本)?
以下是我需要从中解析数据的网页:https://www.sec.gov/Archives/edgar/data/1108134/000110813423000018/bhlb-20230630.htm https://www.sec.gov/Archives/edgar/data/66740/000006674023000058/mmm-20230630.htm
这是我目前用来解析所选数据的例程,但这种方式并不理想。
// Text to be parsed by Form2_Load, which splits it on "\r\n" into lines.
// NOTE(review): presumably assigned by the caller from the browser selection before the form loads — confirm.
public string SelectedText { get; set; }
/// <summary>
/// Form load handler: splits <c>SelectedText</c> into lines, creates the grid
/// columns from the first line, and adds one grid row per parsed data line.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form2_Load(object sender, EventArgs e)
{
    // Nothing to parse. Previously a null value caused a NullReferenceException
    // and an empty value caused String.Replace to throw on an empty search string.
    if (string.IsNullOrEmpty(SelectedText))
    {
        return;
    }

    // Characters stripped from a line when isolating its caption (line-item name).
    char[] captionNoise = new char[] { '%', '€', ';', ',', '.', '$', '£', '(', ')' };
    string[] lines = SelectedText.Split(new string[] { "\r\n" }, StringSplitOptions.None);

    // The columns are derived from the first line only, so build them once
    // instead of recomputing the same result for every row.
    EnsureGridColumns(lines[0]);

    // Parsing starts at the first line containing "Dollars in millions";
    // every non-empty line after that is parsed as well.
    bool startparse = false;
    foreach (string s in lines)
    {
        if (s == "" || !(s.Contains("Dollars in millions") || startparse))
        {
            continue;
        }

        // Caption = the line with digits/hyphens and the noise characters removed.
        string lineitem = Regex.Replace(s.Trim(), @"[\d-1]", string.Empty);
        lineitem = ReplaceMultipleChar(lineitem, captionNoise, string.Empty);
        lineitem = lineitem.Trim();

        if (lineitem != "")
        {
            string strValues = "";

            // Only look for numbers when the line holds more than the caption.
            if (s.Length > lineitem.Length)
            {
                // "(123)" is accounting notation for a negative amount -> "-123".
                strValues = GetNumericData(s).Trim()
                    .Replace("(", "-").Replace(")", " ").Replace(",", "").Trim();
            }

            List<string> colvalues = strValues.Trim().Split(new char[] { ' ' }).ToList();
            colvalues.Insert(0, lineitem);
            dgv.Rows.Add(colvalues.ToArray());
        }

        startparse = true;
    }
}

/// <summary>
/// Adds one non-sortable column per token of <paramref name="headerLine"/>
/// (plus a leading caption column) when the grid has fewer columns than that.
/// </summary>
/// <param name="headerLine">First line of the selected text.</param>
private void EnsureGridColumns(string headerLine)
{
    // A parenthesised, digit-free group such as "(Dollars in millions)" is
    // treated as the caption of the left-most column.
    string leftColumnName = "";
    var match = Regex.Match(headerLine, "\\(\\D*\\)", RegexOptions.IgnoreCase);
    if (match.Success)
    {
        leftColumnName = match.Groups[0].Value;
    }

    string remainder = headerLine.Trim();
    if (leftColumnName.Length > 0)
    {
        // String.Replace throws ArgumentException for an empty search string,
        // so only strip the caption when one was actually found.
        remainder = remainder.Replace(leftColumnName.Trim(), "");
    }

    List<string> columns = remainder.Trim().Split(new char[] { ' ' }).ToList();
    columns.Insert(0, leftColumnName);

    if (dgv.Columns.Count >= columns.Count)
    {
        return;
    }

    int colCounter = 1;
    foreach (string col in columns)
    {
        string columnName = "col_" + colCounter;
        dgv.Columns.Add(columnName, "");
        dgv.Columns[columnName].SortMode = DataGridViewColumnSortMode.NotSortable;
        colCounter++;
    }
}
/// <summary>
/// Filters <paramref name="input"/> down to the characters that can belong to
/// a numeric value: digits, '.', '-', spaces, an opening parenthesis that is
/// immediately followed by a digit, and a closing parenthesis that is
/// immediately preceded by a digit.
/// </summary>
/// <param name="input">Line of text to filter; may be null or empty.</param>
/// <returns>The kept characters in their original order; empty for null/empty input.</returns>
private string GetNumericData(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return "";
    }

    var output = new System.Text.StringBuilder(input.Length);
    for (int i = 0; i < input.Length; i++)
    {
        char c = input[i];
        if (c == '(')
        {
            // Check the bound BEFORE indexing: "(" may be the last character.
            if (i + 1 < input.Length && Char.IsDigit(input[i + 1]))
            {
                output.Append(c);
            }
        }
        else if (c == ')')
        {
            // Check the bound BEFORE indexing: ")" may be the first character.
            if (i > 0 && Char.IsDigit(input[i - 1]))
            {
                output.Append(c);
            }
        }
        else if (c == '.' || c == '-' || c == ' ' || Char.IsDigit(c))
        {
            output.Append(c);
        }
    }

    return output.ToString();
}
/// <summary>
/// Splits <paramref name="s"/> on every character in <paramref name="separators"/>,
/// discards empty pieces, and joins the remaining pieces with <paramref name="newVal"/>.
/// </summary>
/// <param name="s">Text to process.</param>
/// <param name="separators">Characters to split on.</param>
/// <param name="newVal">Text inserted between the remaining pieces.</param>
/// <returns>The re-joined text.</returns>
public string ReplaceMultipleChar(string s, char[] separators, string newVal)
{
    return String.Join(newVal, s.Split(separators, StringSplitOptions.RemoveEmptyEntries));
}
/// <summary>
/// Replaces every match of the regular expression <paramref name="separators"/>
/// in <paramref name="s"/> with <paramref name="newVal"/>.
/// </summary>
/// <param name="s">Text to search.</param>
/// <param name="separators">Regular-expression pattern (not a literal character list).</param>
/// <param name="newVal">Replacement text.</param>
/// <returns>The text with all matches replaced.</returns>
public string ReplaceAll(string s, string separators, string newVal)
{
    Regex matcher = new Regex(separators);
    string replaced = matcher.Replace(s, newVal);
    return replaced;
}
/// <summary>
/// Replaces whole-word occurrences of <paramref name="wordToFind"/> in
/// <paramref name="original"/> with <paramref name="replacement"/>.
/// </summary>
/// <param name="original">Text to search.</param>
/// <param name="wordToFind">Literal word to look for; regex metacharacters are matched literally.</param>
/// <param name="replacement">Replacement text.</param>
/// <param name="regexOptions">Options passed to the regex engine (e.g. IgnoreCase).</param>
/// <returns>The text with every whole-word occurrence replaced.</returns>
public string ReplaceWholeWord(string original, string wordToFind, string replacement, RegexOptions regexOptions = RegexOptions.None)
{
    // Escape the word so characters such as '.', '(' or '+' are treated as
    // literals instead of changing the pattern or making it invalid.
    string pattern = String.Format(@"\b{0}\b", Regex.Escape(wordToFind));
    return Regex.Replace(original, pattern, replacement, regexOptions);
}
请问有人能告诉我如何从 Web 浏览器控件中获取所选文本的 HTML 吗?或者,是否有其他更好的方法来解析所选的表格数据,以便在 DataGridView 中显示?
谢谢
答:
0赞
Puygrenier Solann
10/18/2023
#1
这属于网页抓取(web scraping),已经有一些现成的工具可以完成这项工作。
不过,您可以直接使用 js + Xpath 从 Web 浏览器的控制台中提取。
下面是我在 C# + XPath 中使用过的相关代码摘录:
/// <summary>
/// Holder for the two values extracted from the page.
/// </summary>
public class MyData
{
    /// <summary>First extracted value.</summary>
    public string data_1 { get; set; }

    /// <summary>Second extracted value.</summary>
    public string data_2 { get; set; }
}
//...
// Example: download a page and extract values from it with XPath.
// The URL, header and XPath strings below are placeholders to be replaced.
string url = "your url";
var client = new RestClient(url);
var request = new RestRequest("", Method.Get);
request.AddHeader("",""); //if needed but for your web site there is a "acceptable policy of automated tools" to set.
var res = client.Execute(request);
if (res.IsSuccessStatusCode is not true) throw new ArgumentException();

HtmlDocument xdc = new HtmlDocument();
// Content can be null when the response has no body; Regex.Replace rejects null input.
// NOTE(review): the pattern below is a single space as it appears in the post; it may
// originally have targeted "&nbsp;" — confirm before relying on it.
string sanitazed = Regex.Replace(res.Content ?? "", " ", "");
xdc.LoadHtml(sanitazed);

// SelectNodes returns null when nothing matches, hence the null-conditional chain.
string mydata1 = xdc.DocumentNode.SelectNodes("you xpath 1")?.First()?.InnerText ?? "";
string mydata2 = xdc.DocumentNode.SelectNodes("you xpath 2")?.First()?.InnerText ?? "";
// Object initializers use '=', and each property is assigned exactly once.
MyData result = new() { data_1 = mydata1, data_2 = mydata2 };

// or alternative for loop,
var nodes = xdc.DocumentNode.SelectNodes("xpath that look like //table//tbody//tr/td[5]/a");
if (nodes is null) throw new Exception(" xpath on error, please check");
foreach (var node in nodes) {
    // Query relative to the current node (use an XPath starting with "./" or ".//");
    // distinct local names avoid clashing with mydata1/mydata2 declared above.
    string nodeData1 = node.SelectNodes("you xpath 1")?.First()?.InnerText ?? "";
    string nodeData2 = node.SelectNodes("you xpath 2")?.First()?.InnerText ?? "";
    // or with attribut
    var data1 = node.GetAttributeValue("href","?");
    var data2 = node.GetAttributeValue("href","?");
}
评论
// Suggestion from the comments: read the inner HTML of the document's active element.
// NOTE(review): ActiveElement is presumably the element that currently has focus, not
// the highlighted range — confirm it actually covers the user's selection.
string htmlOfSelectedText = webBrowser1.Document.ActiveElement.InnerHtml;